In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import geopandas as gpd
from shapely.ops import nearest_points

import warnings
warnings.filterwarnings("ignore")

import os
# Base directory prefixed to the shapefile paths passed to gpd.read_file.
GEODATA_PATH = 'data/geodata/'
# Base directory prefixed to the workbook names passed to pd.read_excel.
TABULAR_PATH = 'data/tabular/'
# Names of the entries currently present in TABULAR_PATH (os.listdir raises if
# the directory does not exist).
data_files = os.listdir(TABULAR_PATH)

In [2]:
def get_sex_age_cols(sex="bot",min_age=7,max_age=25,columns=None):
    """Return the single-age column names for one sex within an age range.

    A column is selected when, after removing the substrings ``'und_'`` and
    ``'and_ove_'`` from its name, the second-to-last ``_``-separated token
    equals ``sex`` (for a name with a single token, that token is compared),
    and the integer in the last ``_``-separated token of the original name
    lies in ``[min_age, max_age]``.

    Parameters
    ----------
    sex : str
        Token identifying the sex in the column name (the notebook uses
        ``"bot"``, ``"mal"`` and ``"fem"``).
    min_age, max_age : int
        Inclusive bounds on the age encoded in the column name.
    columns : iterable of str, optional
        Column names to search. Defaults to ``population_df.columns`` so
        existing calls behave as before.

    Returns
    -------
    list of str
        Matching column names, in their original order.
    """
    if columns is None:
        # Fall back to the notebook-level frame so existing callers are unaffected.
        columns = population_df.columns

    selected = []
    for col in columns:
        tokens = col.replace('und_','').replace('and_ove_','').split("_")
        if tokens[-2:][0] != sex:
            continue
        try:
            age = int(col.split("_")[-1])
        except ValueError:
            # The sex token matched but the suffix is not a single age
            # (e.g. a total column); skip it instead of aborting the scan.
            continue
        if min_age <= age <= max_age:
            selected.append(col)
    return selected

def get_number_of_roads(PSGC):
    """Count the road features that intersect one barangay's geometry.

    Looks up the first row of ``brgys_geodata`` whose ``Bgy_Code`` equals
    ``PSGC`` and tests its geometry against every geometry in
    ``roads_geodata``.

    Parameters
    ----------
    PSGC : str
        Barangay code to look up in ``brgys_geodata["Bgy_Code"]``.

    Returns
    -------
    int
        Number of rows of ``roads_geodata`` whose geometry intersects the
        barangay geometry.

    Raises
    ------
    IndexError
        If no barangay carries the given code.
    """
    matches = brgys_geodata.loc[brgys_geodata["Bgy_Code"]==PSGC, "geometry"]
    if matches.empty:
        raise IndexError("no barangay with Bgy_Code {!r} in brgys_geodata".format(PSGC))
    brgy_geometry = matches.iloc[0]
    # NOTE(review): intersects() compares raw coordinates — both layers are
    # assumed to share one CRS; confirm where the layers are loaded.
    road_intersects = roads_geodata["geometry"].intersects(brgy_geometry)
    # Series.sum() counts the True values in one vectorized pass instead of
    # iterating the Series element by element with the builtin sum().
    return int(road_intersects.sum())

def get_nearest_school(brgy_code):
    """Return the ``SCH_ID`` of the school closest to a barangay's centroid.

    The centroid of the first ``brgys_geodata`` row whose ``Bgy_Code`` equals
    ``brgy_code`` is compared with every geometry in ``schools_geodata``; the
    row with the smallest distance wins (the first such row on ties).

    Parameters
    ----------
    brgy_code : str
        Barangay code to look up in ``brgys_geodata["Bgy_Code"]``.

    Returns
    -------
    The ``SCH_ID`` value of the nearest school.

    Raises
    ------
    IndexError
        If no barangay carries the given code.
    ValueError
        If no school has a usable geometry.
    """
    cond = brgys_geodata["Bgy_Code"]==brgy_code
    centroid = brgys_geodata[cond].geometry.centroid.values[0]
    # One vectorized distance computation replaces rebuilding the union of
    # all school points on every call and matching the result back by exact
    # geometry equality.
    # NOTE(review): distances are in the units of the layers' coordinates and
    # assume both layers share one CRS — confirm where the layers are loaded.
    distances = schools_geodata.geometry.distance(centroid).to_numpy()
    # nanargmin ignores rows whose distance is NaN (missing geometries).
    nearest_position = int(np.nanargmin(distances))
    return schools_geodata["SCH_ID"].iloc[nearest_position]

def get_nearest_school_distance(brgy_code,school_code):
    """Distance between a barangay's centroid and a given school.

    Parameters
    ----------
    brgy_code : str
        Value matched against ``brgys_geodata["Bgy_Code"]``.
    school_code
        Value matched against ``schools_geodata["SCH_ID"]``.

    Returns
    -------
    float
        Result of ``centroid.distance(school_geometry)``, expressed in the
        units of the geometries' coordinates.
        NOTE(review): presumably degrees, since the barangay layer is
        reprojected to EPSG:4326 elsewhere in the notebook — confirm.

    Raises
    ------
    IndexError
        If either code has no matching row.
    """
    brgy_rows = brgys_geodata["Bgy_Code"]==brgy_code
    # First matching barangay only; its centroid is the reference point.
    brgy_centroid = brgys_geodata[brgy_rows].geometry.centroid.values[0]

    school_rows = schools_geodata["SCH_ID"]==school_code
    school_geometry = schools_geodata.loc[school_rows,'geometry'].values[0]

    return brgy_centroid.distance(school_geometry)

In [3]:
# Read the population workbook, then keep only the rows whose PSGC_REGI is
# 'PH150000000' and renumber them from zero.
population_df = pd.read_excel(TABULAR_PATH+'Population 2010, 2015 ARMM,BCT.xlsx')
cond = population_df['PSGC_REGI'].eq('PH150000000')
population_df = population_df.loc[cond].reset_index(drop=True)
print("population_df",population_df.shape)

# Collapse the single-age population columns into fixed-width age buckets,
# producing one total per bucket for both sexes, males and females.
# The pieces are collected in a list and concatenated once at the end rather
# than growing the frame with pd.concat on every iteration.
age_group_frames = [population_df[["PSGC_BRGY"]]]

age_range=5
for age in range(0,81,age_range):
    min_age = age
    max_age = age+age_range-1
    # NOTE(review): the last bucket is labelled 80_to_84; get_sex_age_cols
    # strips an 'and_ove_' marker, so that bucket may really be open-ended —
    # confirm against the workbook's column names.

    both_age_cols = get_sex_age_cols(sex="bot",min_age=min_age,max_age=max_age)
    male_age_cols = get_sex_age_cols(sex="mal",min_age=min_age,max_age=max_age)
    female_age_cols = get_sex_age_cols(sex="fem",min_age=min_age,max_age=max_age)

    both_col = 'both_age_{}_to_{}'.format(min_age,max_age)
    male_col = 'male_age_{}_to_{}'.format(min_age,max_age)
    female_col = 'female_age_{}_to_{}'.format(min_age,max_age)

    # Build the bucket totals as a brand-new frame. Adding columns to (and
    # dropping columns from) a slice of population_df is chained assignment,
    # which pandas only tolerates with a SettingWithCopyWarning.
    age_group_frames.append(pd.DataFrame({
        both_col: population_df[both_age_cols].sum(axis=1),
        male_col: population_df[male_age_cols].sum(axis=1),
        female_col: population_df[female_age_cols].sum(axis=1),
    }))

# Every piece shares population_df's index, so the column-wise concat lines
# the rows up one-to-one.
population_age_groups_df = pd.concat(age_group_frames,axis=1)

print("population_age_groups_df",population_age_groups_df.shape)

# School count per barangay, with the count column renamed to No_of_Schools.
schools_df = pd.read_excel(
    TABULAR_PATH+'Number of schools per brgy.xlsx',
    usecols=["PSGC_BRGY","Number of School"],
).rename(columns={'Number of School':'No_of_Schools'})
print("schools_df",schools_df.shape)

# One-hot encode school type and category, then add the indicators up per
# barangay so each column counts the schools of that kind.
schools_meta_df = pd.read_excel(
    TABULAR_PATH+'SchoolsDepEd2017.xlsx',
    usecols=["PSGC_BRGY","SCH_CAT","SCH_TYPE"],
)
schools_meta_df = (
    pd.get_dummies(schools_meta_df, columns=["SCH_TYPE","SCH_CAT"])
    .groupby("PSGC_BRGY",as_index=False)
    .sum()
)
print('schools_meta_df',schools_meta_df.shape)

# Left joins keep every barangay of the population table; barangays absent
# from the school tables get NaN in the school columns.
education_df = (
    population_age_groups_df
    .merge(schools_df,on="PSGC_BRGY",how="left")
    .merge(schools_meta_df,on="PSGC_BRGY",how="left")
)
print("education_df",education_df.shape)

# Lighting fuel: fraction of households (0-1, despite the "Perc" prefix)
# using electricity and kerosene, per PSGC code.
source_light_df = pd.read_excel(TABULAR_PATH+'Number of Households by Kind of Fuel for Lighting (ARMM 2015) Brgy.xlsx')
light_households = source_light_df["Number of HH"]
source_light_df = source_light_df.assign(
    Perc_Households_Electricity_Light=source_light_df["Electricity"]/light_households,
    Perc_Households_Kerosene_Gass_Light=source_light_df["Kerosene (Gaas)"]/light_households,
)[["PSGC","Perc_Households_Electricity_Light","Perc_Households_Kerosene_Gass_Light"]]
print("source_light_df",source_light_df.shape)

# Drinking water: fraction of households served by an own-use or a shared
# community faucet, per PSGC code.
source_drinking_water_df = pd.read_excel(TABULAR_PATH+'Main Source of Water Supply for Drinking (ARMM 2015) Brgy.xlsx')
water_households = source_drinking_water_df["Number of Households"]
source_drinking_water_df = source_drinking_water_df.assign(
    Perc_Households_Drinking_Owned_Faucet=source_drinking_water_df["Own use faucet community water system"]/water_households,
    Perc_Households_Drinking_Shared_Faucet=source_drinking_water_df["Shared faucet community water system"]/water_households,
)[["PSGC","Perc_Households_Drinking_Owned_Faucet","Perc_Households_Drinking_Shared_Faucet"]]
print("source_drinking_water_df",source_drinking_water_df.shape)

# Combine both indicator tables on the PSGC code, keeping every row of the
# lighting table.
sdg_df = source_light_df.merge(source_drinking_water_df,on="PSGC",how="left")
print("sdg_df",sdg_df.shape)

# Attach the household indicators to the education table. The join key is
# named PSGC on the right-hand side, so that duplicate key column is dropped
# after the merge.
education_sdg_df = (
    education_df
    .merge(sdg_df,left_on="PSGC_BRGY",right_on="PSGC",how="left")
    .drop(columns="PSGC")
)
print("education_sdg_df",education_sdg_df.shape)

population_df (2482, 288)
population_age_groups_df (2482, 52)
schools_df (1570, 2)
schools_meta_df (1570, 16)
education_df (2482, 68)
source_light_df (2490, 3)
source_drinking_water_df (2490, 3)
sdg_df (2490, 5)
education_sdg_df (2482, 72)


In [4]:
# Barangay polygons: keep region PH150000000 and reproject to EPSG:4326.
brgys_geodata = gpd.read_file(GEODATA_PATH+'AdministrativeBoundariesBARMMBarangays20190206PSA2016/AdministrativeBoundariesBARMMBarangays20190206PSA2016.shp')
cond = brgys_geodata['Reg_Code']=='PH150000000'
brgys_geodata=brgys_geodata[cond].reset_index(drop=True)
brgys_geodata = brgys_geodata.to_crs("EPSG:4326")
brgys_geodata = brgys_geodata[["Bgy_Code","geometry"]]
print("brgys_geodata",brgys_geodata.shape)

# The spatial helpers compare raw coordinates across layers, so every layer
# must be in the same CRS as the barangays. A layer that declares a different
# CRS is reprojected; a layer with no declared CRS is left untouched because
# it cannot be reprojected.
roads_geodata = gpd.read_file(GEODATA_PATH+'InfraRoadsARMMOSM2019/InfraRoadsARMMOSM2019.shp')
roads_geodata = roads_geodata[["osm_id","geometry"]]
if roads_geodata.crs is not None and roads_geodata.crs != brgys_geodata.crs:
    roads_geodata = roads_geodata.to_crs(brgys_geodata.crs)
print("roads_geodata",roads_geodata.shape)

schools_geodata = gpd.read_file(GEODATA_PATH+'SchoolsARMMDEPED2017/SchoolsARMMDEPED2017.shp')
schools_geodata = schools_geodata[["SCH_ID","geometry"]]
if schools_geodata.crs is not None and schools_geodata.crs != brgys_geodata.crs:
    schools_geodata = schools_geodata.to_crs(brgys_geodata.crs)
print("schools_geodata",schools_geodata.shape)

# Per-barangay spatial features: number of intersecting road features, the
# nearest school's id, and the centroid-to-school distance.
# NOTE(review): with EPSG:4326 the distance is in coordinate units (degrees),
# not metres — confirm that is what downstream analysis expects.
education_sdg_df["number_of_roads"] = education_sdg_df["PSGC_BRGY"].apply(get_number_of_roads)
education_sdg_df['Nearest_School'] = education_sdg_df["PSGC_BRGY"].apply(get_nearest_school)
education_sdg_df['Nearest_School_Distance'] = education_sdg_df.apply(lambda row: get_nearest_school_distance(row['PSGC_BRGY'], row['Nearest_School']), axis=1)

print("education_sdg_df + geoinfo",education_sdg_df.shape)

brgys_geodata (2489, 2)
roads_geodata (6573, 2)
schools_geodata (2711, 2)
education_sdg_df + geoinfo (2482, 75)


In [17]:
# Preview the first five barangays transposed (one line per column), lifting
# the row-display limit only inside this block so every column is shown.
with pd.option_context("display.max_rows",None):
    display(education_sdg_df.head().T)

Unnamed: 0,0,1,2,3,4
PSGC_BRGY,PH150702001,PH150702002,PH150702004,PH150702005,PH150702006
both_age_0_to_4,88,210,57,277,352
male_age_0_to_4,54,100,27,152,146
female_age_0_to_4,34,110,30,125,206
both_age_5_to_9,79,161,62,305,354
male_age_5_to_9,35,77,40,159,201
female_age_5_to_9,44,84,22,146,153
both_age_10_to_14,73,133,80,269,398
male_age_10_to_14,42,69,39,155,217
female_age_10_to_14,31,64,41,114,181


In [6]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing (a fresh checkout may not have it).
os.makedirs('./data/output', exist_ok=True)
# NOTE(review): the default index=True also writes the row index as an
# unnamed first column — confirm downstream readers expect it.
education_sdg_df.to_csv('./data/output/Population_Schools_SDGs_Clean.csv')