In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import geopandas as gpd
from shapely.ops import nearest_points

import os
import warnings

# Blanket suppression: every warning category from every library is hidden
# for the rest of the session, so deprecation and data-handling warnings
# raised further down will not be shown.
warnings.filterwarnings("ignore")

# Input folders, relative to the notebook's working directory.
GEODATA_PATH = 'data/geodata/'
TABULAR_PATH = 'data/tabular/'

# List the available geodata folders, then the tabular workbooks.
print(os.listdir(GEODATA_PATH))

data_files = os.listdir(TABULAR_PATH)
data_files

['AdministrativeBoundariesBARMMBarangays20190206PSA2016', 'InfraRoadsARMMOSM2019', 'SchoolsARMMDEPED2017']


['Condition(State of Repair)oftheBuilding(ARMM, 2010)Brgy.xlsx',
 'Construction Materials of the Roof (ARMM 2015)Brgy.xlsx',
 'Construction Materials of the Walls(ARMM 2015).xlsx',
 'Floor Area (ARMM, 2010) Brgy.xlsx',
 'Household Population by Ethnicity (ARMM, 2010) Brgy.xlsx',
 'Main Source of Water Supply for Cooking (ARMM 2015) Brgy.xlsx',
 'Main Source of Water Supply for Drinking (ARMM 2015) Brgy.xlsx',
 'Number of Households by Kind of Fuel for Lighting (ARMM 2015) Brgy.xlsx',
 'Number of schools per brgy.xlsx',
 'Population 2010, 2015 ARMM,BCT.xlsx',
 'Tenure Status of the Lot (ARMM 2010) Brgy.xlsx',
 'Type of Building (ARMM 2015) Brgy.xlsx',
 'Year Built (ARMM 2010) Brgy.xlsx']

In [2]:
def get_sex_age_cols(sex="bot",min_age=7,max_age=25,df=None):
    """Return the single-age population columns for one sex tag and age range.

    A column qualifies when, after splitting its name on "_", the
    second-to-last token equals ``sex`` and the last token parses as an
    integer inside ``[min_age, max_age]`` (both bounds inclusive).

    Parameters
    ----------
    sex : str
        Tag compared with the second-to-last "_"-separated token of every
        column name. The default "bot" is presumably the both-sexes tag
        used by the population workbook (verify against its headers).
    min_age, max_age : int
        Inclusive age bounds applied to the trailing integer token.
    df : pandas.DataFrame, optional
        Frame whose columns are scanned. When omitted, the notebook-level
        ``population_df`` is used, so existing calls keep working.

    Returns
    -------
    list
        Matching column labels, in the frame's column order.
    """
    if df is None:
        # Resolved at call time, so the global only has to exist by then.
        df = population_df

    selected_cols = []
    for col in df.columns:
        tokens = str(col).split("_")
        if len(tokens) < 2 or tokens[-2] != sex:
            continue
        try:
            age = int(tokens[-1])
        except ValueError:
            # Tag matched but the suffix is not an age (e.g. "mal_total"):
            # skip the column instead of aborting the whole scan.
            continue
        if min_age <= age <= max_age:
            selected_cols.append(col)
    return selected_cols

In [3]:
# Load the barangay-level population workbook and keep only the rows whose
# region code is PH150000000, renumbering the index from zero.
population_df = pd.read_excel(TABULAR_PATH + 'Population 2010, 2015 ARMM,BCT.xlsx')
cond = population_df['PSGC_REGI'].eq('PH150000000')
population_df = population_df.loc[cond].reset_index(drop=True)
population_df.head()

Unnamed: 0,REGI,PSGC_REGI,PROV,PSGC_PROV,MuniCities,PSGC_CITY/MUNI,BrgyName,PSGC_BRGY,BCT,Pop2010,...,80_abv_mal,num_hh0_fem.1,bel_20_hh0_fem,20-29_hh0_fem,30-39_hh0_fem,40-49_hh0_fem,50-59_hh0_fem,60-69_hh0_fem,70-79_hh0_fem,80_abv_fem
0,Autonomous Region In Muslim Mindanao,PH150000000,Basilan,PH150700000,City Of Lamitan,PH150702000,Arco,PH150702001,1,800,...,3.0,23.0,0.0,1.0,3.0,0.0,8.0,6.0,2.0,3.0
1,Autonomous Region In Muslim Mindanao,PH150000000,Basilan,PH150700000,City Of Lamitan,PH150702000,Ba-as,PH150702002,1,1266,...,3.0,72.0,0.0,9.0,13.0,16.0,13.0,14.0,7.0,0.0
2,Autonomous Region In Muslim Mindanao,PH150000000,Basilan,PH150700000,City Of Lamitan,PH150702000,Baimbing,PH150702004,1,451,...,0.0,25.0,0.0,4.0,1.0,3.0,4.0,8.0,4.0,1.0
3,Autonomous Region In Muslim Mindanao,PH150000000,Basilan,PH150700000,City Of Lamitan,PH150702000,Balagtasan,PH150702005,1,2597,...,6.0,76.0,1.0,2.0,10.0,18.0,24.0,7.0,13.0,1.0
4,Autonomous Region In Muslim Mindanao,PH150000000,Basilan,PH150700000,City Of Lamitan,PH150702000,Balas,PH150702006,1,2809,...,2.0,166.0,2.0,26.0,39.0,28.0,32.0,28.0,7.0,4.0


In [4]:
# Single-age column groups (ages 7-25 by default) for each sex tag.
both_student_age_cols = get_sex_age_cols()
male_student_age_cols = get_sex_age_cols(sex="mal")
female_student_age_cols = get_sex_age_cols(sex="fem")

# Build the per-barangay totals as a brand-new frame. Assigning new columns
# into a column slice of population_df (and then dropping in place) is the
# chained-assignment pattern pandas flags with SettingWithCopyWarning;
# .assign() on the key column returns an independent frame with the same
# four columns in the same order.
student_population_df = population_df[["PSGC_BRGY"]].assign(
    both_student_age_7_to_25=population_df[both_student_age_cols].sum(axis=1),
    male_student_age_7_to_25=population_df[male_student_age_cols].sum(axis=1),
    female_student_age_7_to_25=population_df[female_student_age_cols].sum(axis=1),
)
print(student_population_df.shape)
student_population_df.head()

(2482, 4)


Unnamed: 0,PSGC_BRGY,both_student_age_7_to_25,male_student_age_7_to_25,female_student_age_7_to_25
0,PH150702001,250.0,134.0,116.0
1,PH150702002,497.0,247.0,250.0
2,PH150702004,237.0,112.0,125.0
3,PH150702005,953.0,489.0,464.0
4,PH150702006,1277.0,700.0,577.0


In [5]:
# Read only the barangay key and the school count, renaming the count
# column to an identifier-friendly label in the same expression.
schools_df = pd.read_excel(
    TABULAR_PATH + 'Number of schools per brgy.xlsx',
    usecols=["PSGC_BRGY", "Number of School"],
).rename(columns={'Number of School': 'No_of_Schools'})
print(schools_df.shape)
schools_df.head()

(1570, 2)


Unnamed: 0,PSGC_BRGY,No_of_Schools
0,PH150702001,1
1,PH150702002,1
2,PH150702005,1
3,PH150702006,1
4,PH150702007,1


In [6]:
# Left join: every barangay in the student-population table is kept, and
# barangays with no row in schools_df get NaN in No_of_Schools.
education_df = student_population_df.merge(schools_df, how="left", on="PSGC_BRGY")
print(education_df.shape)
education_df.head()

(2482, 5)


Unnamed: 0,PSGC_BRGY,both_student_age_7_to_25,male_student_age_7_to_25,female_student_age_7_to_25,No_of_Schools
0,PH150702001,250.0,134.0,116.0,1.0
1,PH150702002,497.0,247.0,250.0,1.0
2,PH150702004,237.0,112.0,125.0,
3,PH150702005,953.0,489.0,464.0,1.0
4,PH150702006,1277.0,700.0,577.0,1.0


In [7]:
# Share of households per barangay lighting with electricity / kerosene.
# The "Perc_" columns hold fractions of "Number of HH", not values scaled
# to 100.
source_light_df = (
    pd.read_excel(TABULAR_PATH + 'Number of Households by Kind of Fuel for Lighting (ARMM 2015) Brgy.xlsx')
    .assign(
        Perc_Households_Electricity_Light=lambda d: d["Electricity"] / d["Number of HH"],
        Perc_Households_Kerosene_Gass_Light=lambda d: d["Kerosene (Gaas)"] / d["Number of HH"],
    )
    .loc[:, ["PSGC", "Perc_Households_Electricity_Light", "Perc_Households_Kerosene_Gass_Light"]]
)
print(source_light_df.shape)
source_light_df.head()

(2490, 3)


Unnamed: 0,PSGC,Perc_Households_Electricity_Light,Perc_Households_Kerosene_Gass_Light
0,PH150702001,0.65625,0.34375
1,PH150702002,0.792208,0.207792
2,PH150702004,0.39604,0.594059
3,PH150702005,0.870968,0.126728
4,PH150702006,0.71978,0.236264


In [8]:
# Share of households per barangay drinking from an own-use / shared
# community-water-system faucet. The "Perc_" columns hold fractions of
# "Number of Households", not values scaled to 100.
source_drinking_water_df = (
    pd.read_excel(TABULAR_PATH + 'Main Source of Water Supply for Drinking (ARMM 2015) Brgy.xlsx')
    .assign(
        Perc_Households_Drinking_Owned_Faucet=lambda d: (
            d["Own use faucet community water system"] / d["Number of Households"]
        ),
        Perc_Households_Drinking_Shared_Faucet=lambda d: (
            d["Shared faucet community water system"] / d["Number of Households"]
        ),
    )
    .loc[:, ["PSGC", "Perc_Households_Drinking_Owned_Faucet", "Perc_Households_Drinking_Shared_Faucet"]]
)
print(source_drinking_water_df.shape)
source_drinking_water_df.head()

(2490, 3)


Unnamed: 0,PSGC,Perc_Households_Drinking_Owned_Faucet,Perc_Households_Drinking_Shared_Faucet
0,PH150702001,0.05,0.41875
1,PH150702002,0.051948,0.008658
2,PH150702004,0.019802,0.138614
3,PH150702005,0.099078,0.730415
4,PH150702006,0.001832,0.012821


In [9]:
# Combine the lighting and drinking-water indicators on the barangay code,
# keeping every row of the lighting table.
sdg_df = source_light_df.merge(source_drinking_water_df, how="left", on="PSGC")
print(sdg_df.shape)
sdg_df.head()

(2490, 5)


Unnamed: 0,PSGC,Perc_Households_Electricity_Light,Perc_Households_Kerosene_Gass_Light,Perc_Households_Drinking_Owned_Faucet,Perc_Households_Drinking_Shared_Faucet
0,PH150702001,0.65625,0.34375,0.05,0.41875
1,PH150702002,0.792208,0.207792,0.051948,0.008658
2,PH150702004,0.39604,0.594059,0.019802,0.138614
3,PH150702005,0.870968,0.126728,0.099078,0.730415
4,PH150702006,0.71978,0.236264,0.001832,0.012821


In [10]:
# Attach the household indicators to the education table. The key is named
# differently on each side, so the redundant right-hand key column is
# dropped after the join.
education_sdg_df = (
    education_df
    .merge(sdg_df, how="left", left_on="PSGC_BRGY", right_on="PSGC")
    .drop(columns="PSGC")
)
print(education_sdg_df.shape)
education_sdg_df.head().T

(2482, 9)


Unnamed: 0,0,1,2,3,4
PSGC_BRGY,PH150702001,PH150702002,PH150702004,PH150702005,PH150702006
both_student_age_7_to_25,250,497,237,953,1277
male_student_age_7_to_25,134,247,112,489,700
female_student_age_7_to_25,116,250,125,464,577
No_of_Schools,1,1,,1,1
Perc_Households_Electricity_Light,0.65625,0.792208,0.39604,0.870968,0.71978
Perc_Households_Kerosene_Gass_Light,0.34375,0.207792,0.594059,0.126728,0.236264
Perc_Households_Drinking_Owned_Faucet,0.05,0.0519481,0.019802,0.0990783,0.0018315
Perc_Households_Drinking_Shared_Faucet,0.41875,0.00865801,0.138614,0.730415,0.0128205


In [11]:
# Barangay boundaries: keep region PH150000000 only, reproject to
# EPSG:4326, and retain just the barangay code and its geometry.
brgys_geodata = gpd.read_file(
    GEODATA_PATH+'AdministrativeBoundariesBARMMBarangays20190206PSA2016/AdministrativeBoundariesBARMMBarangays20190206PSA2016.shp'
)
cond = brgys_geodata['Reg_Code'].eq('PH150000000')
brgys_geodata = brgys_geodata[cond].reset_index(drop=True).to_crs("EPSG:4326")
brgys_geodata = brgys_geodata[["Bgy_Code", "geometry"]]
brgys_geodata.head()

Unnamed: 0,Bgy_Code,geometry
0,PH157001001,"MULTIPOLYGON (((119.95374 5.07851, 119.95384 5..."
1,PH157001002,"POLYGON ((119.88709 5.06919, 119.88709 5.06920..."
2,PH157001003,"MULTIPOLYGON (((120.06363 5.16131, 120.06365 5..."
3,PH157001004,"POLYGON ((120.13585 5.19601, 120.13584 5.19603..."
4,PH157001005,"MULTIPOLYGON (((119.98776 4.98564, 119.98774 4..."


In [12]:
# Road line features; only the OSM id and geometry are kept.
# NOTE(review): unlike the barangay layer, this layer is not reprojected
# here - confirm it is already in the same CRS before spatial predicates.
roads_geodata = gpd.read_file(
    GEODATA_PATH + 'InfraRoadsARMMOSM2019/InfraRoadsARMMOSM2019.shp'
)[["osm_id", "geometry"]]
roads_geodata.head()

Unnamed: 0,osm_id,geometry
0,210935215,"LINESTRING (122.27089 6.64083, 122.27119 6.640..."
1,210935219,"LINESTRING (122.11945 6.50060, 122.11907 6.501..."
2,210935220,"LINESTRING (122.14195 6.63450, 122.14231 6.634..."
3,210935221,"LINESTRING (122.16074 6.63023, 122.16049 6.629..."
4,210935223,"LINESTRING (122.22433 6.65494, 122.22405 6.655..."


In [13]:
# School point features; only the school id and geometry are kept.
# NOTE(review): unlike the barangay layer, this layer is not reprojected
# here - confirm it is already in the same CRS before measuring distances.
schools_geodata = gpd.read_file(
    GEODATA_PATH + 'SchoolsARMMDEPED2017/SchoolsARMMDEPED2017.shp'
)[["SCH_ID", "geometry"]]
schools_geodata.head()

Unnamed: 0,SCH_ID,geometry
0,133049,POINT (121.91094 6.50770)
1,215015,POINT (122.19572 6.66413)
2,133154,POINT (122.12672 6.44361)
3,133023,POINT (121.89963 6.66000)
4,133081,POINT (121.95404 6.34201)


In [14]:
def get_number_of_roads(PSGC):
    """Count the road features that intersect one barangay's geometry.

    Parameters
    ----------
    PSGC : str
        Barangay code, matched against ``brgys_geodata["Bgy_Code"]``.

    Returns
    -------
    int
        Number of rows in ``roads_geodata`` whose geometry intersects the
        first barangay geometry carrying that code.

    Raises
    ------
    IndexError
        If no barangay in ``brgys_geodata`` has the given code.
    """
    matches = brgys_geodata.loc[brgys_geodata["Bgy_Code"] == PSGC, "geometry"]
    if matches.empty:
        raise IndexError("Barangay code {!r} not found in brgys_geodata".format(PSGC))
    brgy_geometry = matches.iloc[0]
    # Vectorised count of True values; the builtin sum() would iterate the
    # boolean Series element by element in Python.
    return roads_geodata["geometry"].intersects(brgy_geometry).sum()

def get_nearest_school(brgy_code):
    """Return the id of the school closest to a barangay's centroid.

    The centroid of the barangay geometry is compared with every school
    geometry in one vectorised distance computation and the closest one is
    picked (the first one on ties). This avoids rebuilding the union of all
    school points on every call and avoids locating the winner through a
    geometry equality comparison.

    Distances are planar, in the coordinate units of the layers, so they
    are only used here for ranking.

    Parameters
    ----------
    brgy_code : str
        Barangay code, matched against ``brgys_geodata["Bgy_Code"]``.

    Returns
    -------
    The ``SCH_ID`` value of the nearest row in ``schools_geodata``.

    Raises
    ------
    IndexError
        If no barangay in ``brgys_geodata`` has the given code.
    ValueError
        If no school has a usable geometry (every distance is NaN).
    """
    cond = brgys_geodata["Bgy_Code"]==brgy_code
    centroid = brgys_geodata[cond].geometry.centroid.values[0]
    distances = schools_geodata.geometry.distance(centroid)
    # Positional arg-min, ignoring schools whose distance is NaN, so the
    # lookup does not depend on the frame's index labels.
    nearest_pos = np.nanargmin(distances.values)
    return schools_geodata["SCH_ID"].values[nearest_pos]

def get_nearest_school_distance(brgy_code,school_code):
    """Distance between a barangay's centroid and a given school.

    Parameters
    ----------
    brgy_code : str
        Barangay code, matched against ``brgys_geodata["Bgy_Code"]``.
    school_code
        School id, matched against ``schools_geodata["SCH_ID"]``.

    Returns
    -------
    float
        Planar distance in the coordinate units of the geometries. The
        barangay layer is reprojected to EPSG:4326 when loaded, so the
        value is in degrees rather than metres (assuming the school layer
        is also lon/lat - confirm).

    Raises
    ------
    IndexError
        If either code has no matching row.
    """
    brgy_rows = brgys_geodata["Bgy_Code"] == brgy_code
    school_rows = schools_geodata["SCH_ID"] == school_code
    # First matching geometry on each side.
    brgy_centroid = brgys_geodata[brgy_rows].geometry.centroid.values[0]
    school_point = schools_geodata.loc[school_rows, 'geometry'].values[0]
    return brgy_centroid.distance(school_point)

In [15]:
# Spatial features per barangay: intersecting road count, id of the school
# nearest to the centroid, and the distance to that school.
education_sdg_df["number_of_roads"] = education_sdg_df["PSGC_BRGY"].map(get_number_of_roads)
education_sdg_df['Nearest_School'] = education_sdg_df["PSGC_BRGY"].map(get_nearest_school)
education_sdg_df['Nearest_School_Distance'] = [
    get_nearest_school_distance(brgy_code, school_code)
    for brgy_code, school_code in zip(education_sdg_df['PSGC_BRGY'], education_sdg_df['Nearest_School'])
]

print(education_sdg_df.shape)
print(education_sdg_df.isna().sum())
education_sdg_df.head().T

(2482, 12)
PSGC_BRGY                                   0
both_student_age_7_to_25                    0
male_student_age_7_to_25                    0
female_student_age_7_to_25                  0
No_of_Schools                             914
Perc_Households_Electricity_Light           0
Perc_Households_Kerosene_Gass_Light         0
Perc_Households_Drinking_Owned_Faucet       0
Perc_Households_Drinking_Shared_Faucet      0
number_of_roads                             0
Nearest_School                              0
Nearest_School_Distance                     0
dtype: int64


Unnamed: 0,0,1,2,3,4
PSGC_BRGY,PH150702001,PH150702002,PH150702004,PH150702005,PH150702006
both_student_age_7_to_25,250,497,237,953,1277
male_student_age_7_to_25,134,247,112,489,700
female_student_age_7_to_25,116,250,125,464,577
No_of_Schools,1,1,,1,1
Perc_Households_Electricity_Light,0.65625,0.792208,0.39604,0.870968,0.71978
Perc_Households_Kerosene_Gass_Light,0.34375,0.207792,0.594059,0.126728,0.236264
Perc_Households_Drinking_Owned_Faucet,0.05,0.0519481,0.019802,0.0990783,0.0018315
Perc_Households_Drinking_Shared_Faucet,0.41875,0.00865801,0.138614,0.730415,0.0128205
number_of_roads,0,1,3,5,4
