In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import geopandas as gpd
from shapely.ops import nearest_points

import warnings
# NOTE(review): this silences every warning for the rest of the session, not just a
# specific noisy one. Diagnostics raised by pandas/geopandas in later cells will
# therefore never be shown — consider narrowing the filter to a category/module.
warnings.filterwarnings("ignore")

import os
# Base directories for the input data, relative to the notebook's working directory.
# Both end with "/" because later cells build file paths by plain string concatenation.
GEODATA_PATH = 'data/geodata/'
TABULAR_PATH = 'data/tabular/'
# Names of the entries found in the tabular data directory.
data_files = os.listdir(TABULAR_PATH)

In [2]:
def get_sex_age_cols(sex="bot", min_age=7, max_age=25, columns=None):
    """Select single-age population columns for one sex code and an age range.

    A column qualifies when, after splitting its name on "_", the
    second-to-last token equals ``sex`` and the last token is an integer
    age within ``[min_age, max_age]`` (both bounds inclusive).

    Parameters
    ----------
    sex : str
        Token identifying the sex group in the column name; the notebook
        calls this with "bot", "mal" and "fem".
    min_age, max_age : int
        Inclusive age bounds.
    columns : iterable of str, optional
        Column names to filter. Defaults to ``population_df.columns``
        (the module-level DataFrame) when omitted.

    Returns
    -------
    list
        Matching column names, in their original order.
    """
    if columns is None:
        columns = population_df.columns

    selected = []
    for col in columns:
        if not isinstance(col, str):
            continue
        parts = col.split("_")
        # Need at least "<sex>_<age>" for the column to be a candidate.
        if len(parts) < 2 or parts[-2] != sex:
            continue
        try:
            age = int(parts[-1])
        except ValueError:
            # Suffix is not a single age (e.g. a total column): skip it
            # instead of aborting the whole selection.
            continue
        if min_age <= age <= max_age:
            selected.append(col)
    return selected

def get_number_of_roads(PSGC):
    """Count the road features that intersect a barangay's geometry.

    Looks up the first row of the module-level ``brgys_geodata`` whose
    "Bgy_Code" equals ``PSGC`` and tests every geometry in the module-level
    ``roads_geodata`` against that barangay's geometry.

    Raises IndexError when no barangay carries the given code.
    """
    is_target = brgys_geodata["Bgy_Code"] == PSGC
    first_label = brgys_geodata.loc[is_target].index[0]
    brgy_shape = brgys_geodata.loc[first_label, "geometry"]
    # Boolean Series: True for each road that touches/crosses the barangay.
    hits = roads_geodata["geometry"].intersects(brgy_shape)
    return int(hits.sum())

def get_nearest_school(brgy_code):
    """Return the SCH_ID of the school closest to a barangay's centroid.

    The centroid of the first row in the module-level ``brgys_geodata`` whose
    "Bgy_Code" equals ``brgy_code`` is compared against every geometry in the
    module-level ``schools_geodata``; the school with the smallest distance
    wins (the first one in table order on ties).

    Distances are planar, in the coordinate units of the layers.
    NOTE(review): ``brgys_geodata`` is reprojected to EPSG:4326 in this
    notebook, so centroid and distance are computed in degrees — confirm this
    is acceptable, or project both layers to a metric CRS first.

    Raises IndexError when no barangay carries the given code.
    """
    cond = brgys_geodata["Bgy_Code"] == brgy_code
    centroid = brgys_geodata[cond].geometry.centroid.values[0]
    # One vectorised distance computation replaces building the union of all
    # school points on every call and matching the result by geometry equality.
    distances = schools_geodata.geometry.distance(centroid)
    return schools_geodata.loc[distances.idxmin(), "SCH_ID"]

def get_nearest_school_distance(brgy_code,school_code):
    """Distance between a barangay's centroid and a given school.

    ``brgy_code`` selects the barangay from the module-level ``brgys_geodata``
    (column "Bgy_Code"); ``school_code`` selects the school from the
    module-level ``schools_geodata`` (column "SCH_ID"). In both cases the
    first matching row is used.

    Returns the value of ``centroid.distance(school_geometry)``, i.e. a
    distance expressed in the coordinate units of the geometries.
    Raises IndexError when either code has no matching row.
    """
    brgy_rows = brgys_geodata[brgys_geodata["Bgy_Code"] == brgy_code]
    brgy_centroid = brgy_rows.geometry.centroid.values[0]

    school_rows = schools_geodata[schools_geodata["SCH_ID"] == school_code]
    school_geom = school_rows['geometry'].values[0]

    return brgy_centroid.distance(school_geom)

In [3]:
# Read the population workbook and keep only the rows of region PH150000000,
# renumbering the index from zero afterwards.
population_df = pd.read_excel(TABULAR_PATH + 'Population 2010, 2015 ARMM,BCT.xlsx')
cond = population_df['PSGC_REGI'].eq('PH150000000')
population_df = population_df.loc[cond].reset_index(drop=True)

print("population_df", population_df.shape)

# Single-age columns for ages 7-25 (the defaults of get_sex_age_cols), per sex group.
both_student_age_cols = get_sex_age_cols()
male_student_age_cols = get_sex_age_cols(sex="mal")
female_student_age_cols = get_sex_age_cols(sex="fem")

# Start from an explicit copy holding only the barangay key, then add one
# row-wise total per sex group. This avoids assigning onto a slice of
# population_df and the add-then-drop-in-place of the single-age columns.
student_population_df = population_df[["PSGC_BRGY"]].copy()
student_population_df["both_student_age_7_to_25"] = population_df[both_student_age_cols].sum(axis=1)
student_population_df["male_student_age_7_to_25"] = population_df[male_student_age_cols].sum(axis=1)
student_population_df["female_student_age_7_to_25"] = population_df[female_student_age_cols].sum(axis=1)

print("student_population_df",student_population_df.shape)

# Schools-per-barangay table: only the key and the count are read, and the
# count column is renamed to a space-free name.
schools_df = (
    pd.read_excel(
        TABULAR_PATH + 'Number of schools per brgy.xlsx',
        usecols=["PSGC_BRGY", "Number of School"],
    )
    .rename(columns={'Number of School': 'No_of_Schools'})
)
print("schools_df", schools_df.shape)

# Left join keeps every barangay of student_population_df; barangays absent
# from schools_df end up with a missing No_of_Schools.
education_df = student_population_df.merge(schools_df, on="PSGC_BRGY", how="left")
print("education_df", education_df.shape)

# Share of households per barangay using electricity / kerosene for lighting:
# each fuel count is divided by the household total, then only the key and
# the two derived ratios are kept.
source_light_df = (
    pd.read_excel(TABULAR_PATH + 'Number of Households by Kind of Fuel for Lighting (ARMM 2015) Brgy.xlsx')
    .assign(
        Perc_Households_Electricity_Light=lambda d: d["Electricity"] / d["Number of HH"],
        Perc_Households_Kerosene_Gass_Light=lambda d: d["Kerosene (Gaas)"] / d["Number of HH"],
    )
    .loc[:, ["PSGC", "Perc_Households_Electricity_Light", "Perc_Households_Kerosene_Gass_Light"]]
)
print("source_light_df", source_light_df.shape)

# Share of households per barangay drinking from an own-use / shared faucet:
# each count is divided by the household total, then only the key and the
# two derived ratios are kept.
source_drinking_water_df = (
    pd.read_excel(TABULAR_PATH + 'Main Source of Water Supply for Drinking (ARMM 2015) Brgy.xlsx')
    .assign(
        Perc_Households_Drinking_Owned_Faucet=lambda d: (
            d["Own use faucet community water system"] / d["Number of Households"]
        ),
        Perc_Households_Drinking_Shared_Faucet=lambda d: (
            d["Shared faucet community water system"] / d["Number of Households"]
        ),
    )
    .loc[:, ["PSGC", "Perc_Households_Drinking_Owned_Faucet", "Perc_Households_Drinking_Shared_Faucet"]]
)
print("source_drinking_water_df", source_drinking_water_df.shape)

# Combine the lighting and drinking-water indicators on the barangay code.
sdg_df = source_light_df.merge(source_drinking_water_df, on="PSGC", how="left")
print("sdg_df", sdg_df.shape)

# Attach the indicators to the education table. The key is named differently
# on each side, so the right-hand copy ("PSGC") is dropped after the join.
education_sdg_df = (
    education_df
    .merge(sdg_df, left_on="PSGC_BRGY", right_on="PSGC", how="left")
    .drop(columns="PSGC")
)
print("education_sdg_df", education_sdg_df.shape)

population_df (2482, 288)
student_population_df (2482, 4)
schools_df (1570, 2)
education_df (2482, 5)
source_light_df (2490, 3)
source_drinking_water_df (2490, 3)
sdg_df (2490, 5)
education_sdg_df (2482, 9)


In [4]:
# Barangay boundaries: keep region PH150000000, renumber the index,
# reproject to EPSG:4326 and retain only the code and the geometry.
brgys_geodata = gpd.read_file(GEODATA_PATH+'AdministrativeBoundariesBARMMBarangays20190206PSA2016/AdministrativeBoundariesBARMMBarangays20190206PSA2016.shp')
cond = brgys_geodata['Reg_Code'].eq('PH150000000')
brgys_geodata = (
    brgys_geodata.loc[cond]
    .reset_index(drop=True)
    .to_crs("EPSG:4326")
    [["Bgy_Code", "geometry"]]
)
print("brgys_geodata", brgys_geodata.shape)

# Road network: only the OSM id and the geometry are needed.
roads_geodata = gpd.read_file(GEODATA_PATH+'InfraRoadsARMMOSM2019/InfraRoadsARMMOSM2019.shp')
roads_geodata = roads_geodata[["osm_id","geometry"]]
# The barangay layer is reprojected to EPSG:4326 and roads are later tested
# against it with `intersects`, which compares raw coordinates. Bring the
# roads to the same CRS so the predicate is meaningful; this is a no-op if
# the file is already in EPSG:4326, and skipped if the file declares no CRS.
if roads_geodata.crs is not None:
    roads_geodata = roads_geodata.to_crs("EPSG:4326")
print("roads_geodata",roads_geodata.shape)

# School locations: only the school id and the geometry are needed.
schools_geodata = gpd.read_file(GEODATA_PATH+'SchoolsARMMDEPED2017/SchoolsARMMDEPED2017.shp')
schools_geodata = schools_geodata[["SCH_ID","geometry"]]
# Nearest-school lookups compare these geometries with barangay centroids,
# and the barangay layer is reprojected to EPSG:4326. Align the schools to
# the same CRS so distances are computed in one coordinate system; this is a
# no-op if the file is already in EPSG:4326, and skipped if it declares no CRS.
if schools_geodata.crs is not None:
    schools_geodata = schools_geodata.to_crs("EPSG:4326")
print("schools_geodata",schools_geodata.shape)

# Per-barangay spatial features, each derived from the barangay code:
#   number_of_roads          - roads intersecting the barangay geometry
#   Nearest_School           - SCH_ID of the school closest to the centroid
#   Nearest_School_Distance  - distance from the centroid to that school
brgy_codes = education_sdg_df["PSGC_BRGY"]
education_sdg_df["number_of_roads"] = brgy_codes.map(get_number_of_roads)
education_sdg_df['Nearest_School'] = brgy_codes.map(get_nearest_school)
education_sdg_df['Nearest_School_Distance'] = [
    get_nearest_school_distance(brgy_code, school_id)
    for brgy_code, school_id in zip(brgy_codes, education_sdg_df['Nearest_School'])
]

print("education_sdg_df + geoinfo", education_sdg_df.shape)

brgys_geodata (2489, 2)
roads_geodata (6573, 2)
schools_geodata (2711, 2)
education_sdg_df + geoinfo (2482, 12)


In [5]:
# Preview the first five barangays, transposed so each column of the wide
# table is shown as a row.
education_sdg_df.head().T

Unnamed: 0,0,1,2,3,4
PSGC_BRGY,PH150702001,PH150702002,PH150702004,PH150702005,PH150702006
both_student_age_7_to_25,250,497,237,953,1277
male_student_age_7_to_25,134,247,112,489,700
female_student_age_7_to_25,116,250,125,464,577
No_of_Schools,1,1,,1,1
Perc_Households_Electricity_Light,0.65625,0.792208,0.39604,0.870968,0.71978
Perc_Households_Kerosene_Gass_Light,0.34375,0.207792,0.594059,0.126728,0.236264
Perc_Households_Drinking_Owned_Faucet,0.05,0.0519481,0.019802,0.0990783,0.0018315
Perc_Households_Drinking_Shared_Faucet,0.41875,0.00865801,0.138614,0.730415,0.0128205
number_of_roads,0,1,3,5,4


In [6]:
# Create the output directory if needed so the export does not fail on a
# fresh checkout where ./data/output has not been created yet.
os.makedirs('./data/output', exist_ok=True)
# The default index=True is kept, so the row index is written as an unnamed
# first column of the CSV.
education_sdg_df.to_csv('./data/output/Education_SDGs_Clean.csv')