In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import geopandas as gpd
from shapely.ops import nearest_points

import warnings
# Silences every warning category for the rest of the session, so warnings
# raised by later cells are not shown.
warnings.filterwarnings("ignore")

import os
# Data locations; relative paths, so they resolve against the working directory.
GEODATA_PATH = 'data/geodata/'   # shapefiles read with gpd.read_file
TABULAR_PATH = 'data/tabular/'   # Excel/CSV inputs read with pandas
OUTPUT_PATH = 'data/output/'     # cleaned CSVs/shapefiles written (and re-read) by this notebook
# Entry names found in the tabular input directory (os.listdir raises if it is missing).
# Not referenced by the cells visible in this part of the notebook.
data_files = os.listdir(TABULAR_PATH)

Helper Functions

In [2]:
def get_sex_age_cols(sex="bot",min_age=7,max_age=25,columns=None):
    """Select the single-age population columns for one sex within an age range.

    A column is kept when, after the 'und_' and 'and_ove_' markers are removed
    from its name, the second-to-last underscore-separated token equals ``sex``
    and the integer in the last token of the original name lies in
    ``[min_age, max_age]`` (both bounds inclusive).

    Parameters
    ----------
    sex : str
        Sex token to look for; this notebook calls it with "bot", "mal" and "fem".
    min_age, max_age : int
        Inclusive age bounds.
    columns : iterable of str, optional
        Column labels to search. When omitted, the notebook-level
        ``population_df.columns`` is used (looked up at call time), which is
        what the function always did before this argument existed.

    Returns
    -------
    list of str
        Matching column labels, in their original order.

    Raises
    ------
    ValueError
        If a column matches ``sex`` but its last token is not an integer.
    NameError
        If ``columns`` is omitted and ``population_df`` is not defined.
    """
    if columns is None:
        # Fallback keeps existing calls working; pass `columns` to avoid
        # depending on whichever population_df happens to be in the kernel.
        columns = population_df.columns

    student_age_cols = []
    for col in columns:
        tokens = col.replace('und_','').replace('and_ove_','').split("_")
        # [-2:][0] is the second-to-last token, or the only token of a one-token name.
        if tokens[-2:][0] != sex:
            continue
        # The age is read from the unmodified column name.
        age = int(col.split("_")[-1])
        if min_age <= age <= max_age:
            student_age_cols.append(col)
    return student_age_cols

Schools data

In [3]:
def _strip_parenthetical(names):
    """Keep the text before the first " (" in each name and trim surrounding whitespace."""
    # The pattern is longer than one character, so pandas splits on it as a regex.
    return names.str.split(r" \(").str[0].str.strip()


# Population table, restricted to region code PH150000000.
population_df = pd.read_excel(TABULAR_PATH+'Population 2010, 2015 ARMM,BCT.xlsx')
cond = population_df['PSGC_REGI']=='PH150000000'
population_df=population_df[cond].reset_index(drop=True)
print("population_df",population_df.shape)

# DepEd school list; SCH_TYPE is upper-cased so later equality filters are case-insensitive.
schools_meta_df = pd.read_excel(TABULAR_PATH+'SchoolsDepEd2017.xlsx',
                                usecols=['SCH_ID', 'SCH_NAME', 'SCH_CAT', 'SCH_TYPE',
                                         'LAT', 'LONG', 'BARANGAY','MUNNAME', 'PROV'])
schools_meta_df["SCH_TYPE"] = schools_meta_df["SCH_TYPE"].str.upper()
print("schools_meta_df",schools_meta_df.shape)

# Build a lower-cased "barangay, municipality, province" text key on both tables.
# regex=True is explicit because the pattern escapes its parentheses: pandas >= 2.0
# treats str.replace patterns as literal text by default, which would never match.
# This rename must run before the parenthetical is stripped, otherwise
# "Pagatin (Pagatin I)" would collapse to "Pagatin".
brgy_names = population_df["BrgyName"].str.replace(r"Pagatin \(Pagatin I\)","Pagatin I",regex=True)
population_df['brgy_muni_prov_key'] = (
    _strip_parenthetical(brgy_names)
    +", "+_strip_parenthetical(population_df["MuniCities"])
    +", "+_strip_parenthetical(population_df["PROV"])
).str.lower().str.strip()

# Same key for the schools, with two city spellings rewritten (plain-text replacements).
schools_meta_df['brgy_muni_prov_key'] = (
    _strip_parenthetical(schools_meta_df["BARANGAY"])
    +", "+_strip_parenthetical(schools_meta_df["MUNNAME"])
    +", "+_strip_parenthetical(schools_meta_df["PROV"])
).str.lower()\
 .str.replace("city of marawi","marawi city",regex=False)\
 .str.replace("lamitan city","city of lamitan",regex=False)

# Attach PSGC codes to every school; the left join keeps schools whose key has no match.
schools_meta_update = pd.merge(schools_meta_df,population_df[["PSGC_PROV","PSGC_CITY/MUNI","PSGC_BRGY","brgy_muni_prov_key"]],
                               on="brgy_muni_prov_key",how="left")
schools_meta_update = schools_meta_update.drop(columns="brgy_muni_prov_key")
print("schools_meta_update",schools_meta_update.shape)

schools_meta_update.to_csv(OUTPUT_PATH+"SchoolsDepEd2017_Clean.csv",index=False)

# Number of distinct school keys that do not occur among the population keys.
missing_count = len(set(schools_meta_df['brgy_muni_prov_key'])-set(population_df['brgy_muni_prov_key']))
print("\nmissing in brgy_muni_prov_key: ",missing_count)
# Later cells reload what they need from disk.
del population_df, schools_meta_df, schools_meta_update

population_df (2482, 288)
schools_meta_df (2711, 9)
schools_meta_update (2711, 12)

missing in brgy_muni_prov_key:  27


In [4]:
# Public schools whose category matches "ES" or "PS", taken from the cleaned DepEd list.
schools_clean = pd.read_csv(
    OUTPUT_PATH + "SchoolsDepEd2017_Clean.csv",
    usecols=["SCH_ID", "SCH_NAME", "SCH_TYPE", "SCH_CAT", "PSGC_BRGY"],
)
is_public = schools_clean["SCH_TYPE"] == "PUBLIC"
in_elem_category = schools_clean["SCH_CAT"].str.contains("ES|PS")
cond = is_public & in_elem_category
school_public_elem = (
    schools_clean[cond]
    .reset_index(drop=True)
    .drop(columns=["SCH_TYPE", "SCH_CAT"])
)
print("school_public_elem", school_public_elem.shape)

# 2015 public elementary enrollment, ARMM rows only.
schools_elementary_enrollment = pd.read_csv(
    TABULAR_PATH + "deped_publicelementaryenrollment2015.csv",
    encoding='latin',
    usecols=["school_id", "region", "school_name", "year_level", "gender", "enrollment"],
)
cond = schools_elementary_enrollment["region"] == "ARMM - Autonomous Region in Muslim Mindanao"
schools_elementary_enrollment = schools_elementary_enrollment[cond].reset_index(drop=True)
unique_school_ids = schools_elementary_enrollment["school_id"].unique()
print("schools_ementary_enrollment", schools_elementary_enrollment.shape,
      "unique schools:", len(unique_school_ids))

# Long -> wide: one row per school, one column per (gender, year_level) pair.
# pivot_table aggregates with the mean by default, so repeated
# (school, gender, year_level) rows would be averaged, not summed.
schools_elementary_enrollment = schools_elementary_enrollment.pivot_table(
    index=["school_id"],
    columns=["gender", "year_level"],
    values="enrollment",
).reset_index()

# Flatten the two-level header into "<gender>_<year_level>"; the first entry
# belongs to the id column and is replaced by its plain name.
new_cols = ['_'.join(level_pair) for level_pair in schools_elementary_enrollment.columns.to_flat_index()]
schools_elementary_enrollment.columns = ["school_id"] + new_cols[1:]

# Bring in school name and barangay code; schools missing from the public list keep NaN.
schools_elementary_enrollment = schools_elementary_enrollment.merge(
    school_public_elem, how="left", left_on="school_id", right_on="SCH_ID",
).drop(columns="SCH_ID")

print("schools_elementary_enrollment", schools_elementary_enrollment.shape)

schools_elementary_enrollment.to_csv(OUTPUT_PATH + "public_elementary_enrollment2015_Clean.csv", index=False)

# Enrollment school ids that found no match in the cleaned school list.
unmatched = schools_elementary_enrollment["SCH_NAME"].isna()
print("\nmissing school id:", list(schools_elementary_enrollment.loc[unmatched, "school_id"].unique()))

school_public_elem (2101, 3)
schools_ementary_enrollment (6036, 6) unique schools: 503
schools_elementary_enrollment (503, 15)

missing school id: [133004]


In [5]:
# Public schools whose category matches "HS" or "PS" (reuses schools_clean from the previous cell).
# NOTE(review): "PS" is also part of the elementary filter ("ES|PS") — confirm it is
# meant to be included for secondary schools as well.
public_mask = schools_clean["SCH_TYPE"] == "PUBLIC"
secondary_mask = schools_clean["SCH_CAT"].str.contains("HS|PS")
cond = public_mask & secondary_mask
school_public_secondary = (
    schools_clean[cond]
    .reset_index(drop=True)
    .drop(columns=["SCH_TYPE", "SCH_CAT"])
)
print("school_public_secondary", school_public_secondary.shape)

# 2015 public secondary enrollment, ARMM rows only.
schools_secondary_enrollment = pd.read_csv(
    TABULAR_PATH + "deped_publicsecondaryenrollment2015.csv",
    encoding='latin',
    usecols=["school_id", "region", "school_name", "year_level", "gender", "enrollment"],
)
cond = schools_secondary_enrollment["region"] == "ARMM - Autonomous Region in Muslim Mindanao"
schools_secondary_enrollment = schools_secondary_enrollment[cond].reset_index(drop=True)
secondary_ids = schools_secondary_enrollment["school_id"].unique()
print("schools_secondary_enrollment", schools_secondary_enrollment.shape,
      "unique schools:", len(secondary_ids))

# Long -> wide: one row per school, one column per (gender, year_level) pair.
# pivot_table's default aggregation is the mean.
schools_secondary_enrollment = schools_secondary_enrollment.pivot_table(
    index=["school_id"],
    columns=["gender", "year_level"],
    values="enrollment",
).reset_index()

# Flatten the two-level header into "<gender>_<year_level>" and restore the id column name.
new_cols = ['_'.join(level_pair) for level_pair in schools_secondary_enrollment.columns.to_flat_index()]
schools_secondary_enrollment.columns = ["school_id"] + new_cols[1:]

# Bring in school name and barangay code; unmatched schools keep NaN.
schools_secondary_enrollment = schools_secondary_enrollment.merge(
    school_public_secondary, how="left", left_on="school_id", right_on="SCH_ID",
).drop(columns="SCH_ID")

print("schools_secondary_enrollment", schools_secondary_enrollment.shape)

schools_secondary_enrollment.to_csv(OUTPUT_PATH + "public_secondary_enrollment2015_Clean.csv", index=False)

# Enrollment school ids that found no match in the cleaned school list.
no_match = schools_secondary_enrollment["SCH_NAME"].isna()
print("\nmissing school id:", list(schools_secondary_enrollment.loc[no_match, "school_id"].unique()))

school_public_secondary (956, 3)
schools_secondary_enrollment (2432, 6) unique schools: 304
schools_secondary_enrollment (304, 11)

missing school id: [326502]


GeoData

In [10]:
# Barangay boundaries: keep region PH150000000, reproject to EPSG:4326,
# and retain only the barangay code and geometry.
brgys_geodata = gpd.read_file(
    GEODATA_PATH
    + 'AdministrativeBoundariesBARMMBarangays20190206PSA2016/AdministrativeBoundariesBARMMBarangays20190206PSA2016.shp'
)
cond = brgys_geodata['Reg_Code'] == 'PH150000000'
brgys_geodata = brgys_geodata[cond].reset_index(drop=True).to_crs("EPSG:4326")
brgys_geodata = brgys_geodata[["Bgy_Code", "geometry"]]
print("brgys_geodata", brgys_geodata.shape)

# Road network: id and geometry only (no reprojection is applied here).
roads_geodata = gpd.read_file(GEODATA_PATH + 'InfraRoadsARMMOSM2019/InfraRoadsARMMOSM2019.shp')[["osm_id", "geometry"]]
print("roads_geodata", roads_geodata.shape)

# School points: attach the shapefile geometry to the cleaned school table and export it.
schools_geodata = gpd.read_file(GEODATA_PATH + 'SchoolsARMMDEPED2017/SchoolsARMMDEPED2017.shp')
schools_meta_update = pd.read_csv(OUTPUT_PATH + "SchoolsDepEd2017_Clean.csv")
# Geometry is paired with the CSV rows through the row index, not through SCH_ID.
# NOTE(review): assumes the CSV and the shapefile list schools in the same order — confirm.
schools_geodata_update = gpd.GeoDataFrame(schools_meta_update, geometry = schools_geodata["geometry"])
schools_geodata_update.to_file(
    OUTPUT_PATH + "SchoolsDepEd2017_Geodata_Clean/SchoolsDepEd2017_Geodata_Clean.shp"
)
print("schools_geodata_update", schools_geodata_update.shape)
schools_geodata = schools_geodata[["SCH_ID", "geometry"]]
print("schools_geodata", schools_geodata.shape)

brgys_geodata (2489, 2)
roads_geodata (6573, 2)
schools_geodata_update (2711, 13)
schools_geodata (2711, 2)


In [16]:
# Reload the full road layer (all attribute columns) and keep only rows whose
# fclass contains one of the listed road-class names.
roads_geodata = gpd.read_file(GEODATA_PATH + 'InfraRoadsARMMOSM2019/InfraRoadsARMMOSM2019.shp')
# Regex alternation, matched anywhere inside the fclass value.
cond = roads_geodata['fclass'].str.contains("trunk|primary|secondary|tertiary")
roads_geodata = roads_geodata.loc[cond].reset_index(drop=True)
roads_geodata.head()

Unnamed: 0,osm_id,code,fclass,name,ref,oneway,maxspeed,layer,bridge,tunnel,geometry
0,210935215,5113,primary,,,B,0,0,F,F,"LINESTRING (122.27089 6.64083, 122.27119 6.640..."
1,210935219,5113,primary,,,B,0,0,F,F,"LINESTRING (122.11945 6.50060, 122.11907 6.501..."
2,301620414,5115,tertiary,,,B,0,0,F,F,"LINESTRING (122.14038 6.66005, 122.14279 6.661..."
3,103130612,5113,primary,,,B,0,0,F,F,"LINESTRING (122.12169 6.49992, 122.12176 6.499..."
4,301619117,5113,primary,,,B,0,0,F,F,"LINESTRING (122.11945 6.50060, 122.12023 6.500..."


In [36]:
# Intersect every barangay polygon with the first road in roads_geodata.
# The saved cell could not run: it contained an unmatched ")" and looked up a
# 'check' label on the resulting GeoSeries that is never created.
first_road = roads_geodata.loc[0,'geometry']
intersection = brgys_geodata['geometry'].intersection(first_road)
# Barangays the road does not touch produce empty geometries; show only the
# distinct non-empty pieces.
intersection[~intersection.is_empty].unique()

<GeometryArray>
[          <shapely.geometry.linestring.LineString object at 0x000001DE97346320>,
 <shapely.geometry.multilinestring.MultiLineString object at 0x000001DE97346710>,
 <shapely.geometry.multilinestring.MultiLineString object at 0x000001DE97346668>,
           <shapely.geometry.linestring.LineString object at 0x000001DE97346A90>,
 <shapely.geometry.multilinestring.MultiLineString object at 0x000001DE97346780>,
           <shapely.geometry.linestring.LineString object at 0x000001DE97346E48>,
           <shapely.geometry.linestring.LineString object at 0x000001DE973462B0>,
 <shapely.geometry.multilinestring.MultiLineString object at 0x000001DE97346F98>,
 <shapely.geometry.multilinestring.MultiLineString object at 0x000001DE97346F60>,
 <shapely.geometry.multilinestring.MultiLineString object at 0x000001DE97346DD8>,
 <shapely.geometry.multilinestring.MultiLineString object at 0x000001DE97346C18>,
           <shapely.geometry.linestring.LineString object at 0x000001DE97346D30>]


geopandas.geoseries.GeoSeries

Population Data

In [7]:
# Population table, restricted to region code PH150000000.
population_df = pd.read_excel(TABULAR_PATH + 'Population 2010, 2015 ARMM,BCT.xlsx')
cond = population_df['PSGC_REGI'] == 'PH150000000'
population_df = population_df[cond].reset_index(drop=True)
print("population_df", population_df.shape)

# Age brackets to total: 5-year bands 0-4, 5-9, ..., 80-84, followed by four custom bands.
custom_age_ranges = [(7,12),(13,16),(17,18),(19,22)]
age_range = 5
popn_age_ranges = [(age, age + age_range - 1) for age in range(0, 81, age_range)] + custom_age_ranges

# Collect one small frame per bracket and concatenate once at the end;
# the first piece carries the barangay code.
age_group_frames = [population_df[["PSGC_BRGY"]]]

for min_age, max_age in popn_age_ranges:

    # Single-age columns per sex (get_sex_age_cols reads population_df.columns).
    both_age_cols = get_sex_age_cols(sex="bot", min_age=min_age, max_age=max_age)
    male_age_cols = get_sex_age_cols(sex="mal", min_age=min_age, max_age=max_age)
    female_age_cols = get_sex_age_cols(sex="fem", min_age=min_age, max_age=max_age)

    both_col = 'both_age_{}_to_{}'.format(min_age, max_age)
    male_col = 'male_age_{}_to_{}'.format(min_age, max_age)
    female_col = 'female_age_{}_to_{}'.format(min_age, max_age)

    # Row-wise totals for this bracket, in both / male / female order.
    age_groups = pd.DataFrame({
        both_col: population_df[both_age_cols].sum(axis=1),
        male_col: population_df[male_age_cols].sum(axis=1),
        female_col: population_df[female_age_cols].sum(axis=1),
    })
    age_group_frames.append(age_groups)
    del age_groups

population_age_groups_df = pd.concat(age_group_frames, axis=1)

print("population_age_groups_df", population_age_groups_df.shape)

# Number of schools per barangay, with the count column renamed.
schools_df = pd.read_excel(
    TABULAR_PATH + 'Number of schools per brgy.xlsx',
    usecols=["PSGC_BRGY", "Number of School"],
).rename(columns={'Number of School': 'No_of_Schools'})
print("schools_df", schools_df.shape)

# Per-barangay counts of schools by type and by category: one indicator column
# per distinct value, summed within each barangay.
schools_meta_df = pd.read_csv(
    OUTPUT_PATH + 'SchoolsDepEd2017_Clean.csv',
    usecols=["PSGC_BRGY", "SCH_CAT", "SCH_TYPE"],
)
schools_meta_df["SCH_TYPE"] = schools_meta_df["SCH_TYPE"].str.strip().str.upper()
schools_meta_df = (
    pd.get_dummies(schools_meta_df, columns=["SCH_TYPE", "SCH_CAT"])
    .groupby("PSGC_BRGY", as_index=False)
    .sum()
)
print('schools_meta_df', schools_meta_df.shape)

# Left joins keep every barangay of the population table, with NaN where no school data exists.
education_df = (
    population_age_groups_df
    .merge(schools_df, on="PSGC_BRGY", how="left")
    .merge(schools_meta_df, on="PSGC_BRGY", how="left")
)
print("education_df", education_df.shape)

# Lighting: ratio of households using electricity / kerosene to all households.
# (Plain ratios in the 0-1 range despite the "_Pct" suffix — nothing is multiplied by 100.)
source_light_df = pd.read_excel(
    TABULAR_PATH + 'Number of Households by Kind of Fuel for Lighting (ARMM 2015) Brgy.xlsx'
)
light_households = source_light_df["Number of HH"]
source_light_df["SDG_Households_Electricity_Light_Pct"] = source_light_df["Electricity"] / light_households
source_light_df["SDG_Households_Kerosene_Gass_Light_Pct"] = source_light_df["Kerosene (Gaas)"] / light_households
light_keep = ["PSGC", "SDG_Households_Electricity_Light_Pct", "SDG_Households_Kerosene_Gass_Light_Pct"]
source_light_df = source_light_df[light_keep]
print("source_light_df", source_light_df.shape)

# Drinking water: ratio of households on an own-use / shared community faucet.
source_drinking_water_df = pd.read_excel(
    TABULAR_PATH + 'Main Source of Water Supply for Drinking (ARMM 2015) Brgy.xlsx'
)
water_households = source_drinking_water_df["Number of Households"]
source_drinking_water_df["SDG_Households_Drinking_Owned_Faucet_Pct"] = (
    source_drinking_water_df["Own use faucet community water system"] / water_households
)
source_drinking_water_df["SDG_Households_Drinking_Shared_Faucet_Pct"] = (
    source_drinking_water_df["Shared faucet community water system"] / water_households
)
water_keep = ["PSGC", "SDG_Households_Drinking_Owned_Faucet_Pct", "SDG_Households_Drinking_Shared_Faucet_Pct"]
source_drinking_water_df = source_drinking_water_df[water_keep]
print("source_drinking_water_df", source_drinking_water_df.shape)

# Combine both indicator tables on the PSGC code.
sdg_df = source_light_df.merge(source_drinking_water_df, on="PSGC", how="left")
print("sdg_df", sdg_df.shape)

# Attach the indicators to the education table; the duplicate key column is dropped afterwards.
education_sdg_df = education_df.merge(
    sdg_df, how="left", left_on="PSGC_BRGY", right_on="PSGC",
).drop(columns="PSGC")
print("education_sdg_df", education_sdg_df.shape)

population_df (2482, 288)
population_age_groups_df (2482, 64)
schools_df (1570, 2)
schools_meta_df (1745, 14)
education_df (2482, 78)
source_light_df (2490, 3)
source_drinking_water_df (2490, 3)
sdg_df (2490, 5)
education_sdg_df (2482, 82)


In [8]:
print("education_sdg_df",education_sdg_df.shape)
# Use the shared OUTPUT_PATH constant so this export follows the configured
# output directory like the other files written by this notebook.
# NOTE(review): the row index is written as an extra first column here (no
# index=False, unlike the other exports); left as is so the file layout does
# not change — confirm whether downstream readers need it.
education_sdg_df.to_csv(OUTPUT_PATH+'Population_Schools_SDGs_Clean.csv')

# Show the first six rows transposed, lifting the row limit so every column is listed.
with pd.option_context("display.max_rows",None):
    display(education_sdg_df.head(6).T)

education_sdg_df (2482, 82)


Unnamed: 0,0,1,2,3,4,5
PSGC_BRGY,PH150702001,PH150702002,PH150702004,PH150702005,PH150702006,PH150702007
both_age_0_to_4,88,210,57,277,352,270
male_age_0_to_4,54,100,27,152,146,140
female_age_0_to_4,34,110,30,125,206,130
both_age_5_to_9,79,161,62,305,354,236
male_age_5_to_9,35,77,40,159,201,114
female_age_5_to_9,44,84,22,146,153,122
both_age_10_to_14,73,133,80,269,398,220
male_age_10_to_14,42,69,39,155,217,121
female_age_10_to_14,31,64,41,114,181,99
