In [1]:
import json
from typing import Dict, Set

import pandas as pd
import yaml
from IPython.display import display
from rapidfuzz import fuzz

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)

psg_directory = "../resources/"
psg_data_file = "psgc_2025-08-07.csv"

In [2]:
df = pd.read_csv(psg_directory + psg_data_file)
display(df.info())
display(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43769 entries, 0 to 43768
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   psgc_id                43769 non-null  int64  
 1   name                   43769 non-null  object 
 2   correspondence_code    43719 non-null  float64
 3   geographic_level       43767 non-null  object 
 4   old_names              1699 non-null   object 
 5   city_class             149 non-null    object 
 6   income_classification  1724 non-null   object 
 7   settlement_type        42011 non-null  object 
 8   population             43762 non-null  object 
 9   Unnamed: 9             75 non-null     object 
 10  barangay_status        2855 non-null   object 
dtypes: float64(1), int64(1), object(9)
memory usage: 3.7+ MB


None

Unnamed: 0,psgc_id,name,correspondence_code,geographic_level,old_names,city_class,income_classification,settlement_type,population,Unnamed: 9,barangay_status
0,1300000000,National Capital Region (NCR),130000000.0,Reg,,,,,13484462,,
1,1380100000,City of Caloocan,137501000.0,City,,HUC,1st,,1661584,,
2,1380100001,Barangay 1,137501001.0,Bgy,,,,U,2319,,
3,1380100002,Barangay 2,137501002.0,Bgy,,,,U,5156,,
4,1380100003,Barangay 3,137501003.0,Bgy,,,,U,2497,,
...,...,...,...,...,...,...,...,...,...,...,...
43764,1999908006,Manaulanan,124712037.0,Bgy,,,,U,7632,,
43765,1999908007,Pamalian,124712062.0,Bgy,,,,R,3256,,
43766,1999908008,Tapodoc,124717017.0,Bgy,,,,R,1767,,
43767,1999908009,Macabual,124712034.0,Bgy,,,,R,4557,,


In [3]:
df["psgc_id"] = df["psgc_id"].astype(str).str.zfill(10)
df = df.map(lambda x: x.strip() if isinstance(x, str) else x)

geographic_level_map = {
    "Reg":"region",
    "City":"city",
    "Mun":"municipality",
    "Prov":"province",
    "SubMun":"submunicipality", 
    "Bgy":"barangay"
}
df["geographic_level"] = df["geographic_level"].replace(geographic_level_map)

df["barangay_code"] = df["psgc_id"].str[-3:]
df["municipal_or_city_code"] = df["psgc_id"].str[-5:-3]
df["province_or_huc_code"] = df["psgc_id"].str[-8:-5]
df["region_code"] = df["psgc_id"].str[-10:-8]

df["barangay_mapper"] = df["psgc_id"].str[-10:]
df["municipal_or_city_mapper"] = df["psgc_id"].str[-10:-3]
df["province_or_huc_mapper"] = df["psgc_id"].str[-10:-5]
df["region_mapper"] = df["psgc_id"].str[-10:-8]

df.sample(10)

regions_filter = (
    (df["province_or_huc_code"] == "000")
    & (df["municipal_or_city_code"] == "00")
    & (df["barangay_code"] == "000")
)
regions_mapper = (
    df.loc[regions_filter, ["region_mapper", "name"]]
    .sort_values("region_mapper")
    .set_index("region_mapper", drop=True)
    .to_dict()["name"]
)


province_or_huc_filter = (
    ~(df["province_or_huc_code"] == "000")
    & (df["municipal_or_city_code"] == "00")
    & (df["barangay_code"] == "000")
)

province_or_huc_mapper = (
    df.loc[province_or_huc_filter, ["province_or_huc_mapper", "name"]]
    .sort_values("province_or_huc_mapper")
    .set_index("province_or_huc_mapper")
    .to_dict()["name"]
)
municipal_or_city_filter = (
    ~(df["province_or_huc_code"] == "000")
    & ~(df["municipal_or_city_code"] == "00")
    & (df["barangay_code"] == "000")
)

municipal_or_city_mapper = (
    df.loc[municipal_or_city_filter, ["municipal_or_city_mapper", "name"]]
    .sort_values("municipal_or_city_mapper")
    .set_index("municipal_or_city_mapper")
    .to_dict()["name"]
)

df["region"] = df["region_mapper"].map(regions_mapper)
df["province_or_huc"] = df["province_or_huc_mapper"].map(province_or_huc_mapper)
df["municipality_or_city"] = df["municipal_or_city_mapper"].map(municipal_or_city_mapper)
display(df.sample(10))

Unnamed: 0,psgc_id,name,correspondence_code,geographic_level,old_names,city_class,income_classification,settlement_type,population,Unnamed: 9,barangay_status,barangay_code,municipal_or_city_code,province_or_huc_code,region_code,barangay_mapper,municipal_or_city_mapper,province_or_huc_mapper,region_mapper,region,province_or_huc,municipality_or_city
12301,401006031,San Andres Proper,41006031.0,barangay,,,,R,2502,,,31,6,10,4,401006031,401006,4010,4,Region IV-A (CALABARZON),Batangas,Bauan
25644,1804609013,Mag-aso,74609013.0,barangay,,,,R,887,,,13,9,46,18,1804609013,1804609,18046,18,Negros Island Region (NIR),Negros Oriental,Dauin
511,1380601229,Barangay 228,133901229.0,barangay,,,,U,3010,,,229,1,806,13,1380601229,1380601,13806,13,National Capital Region (NCR),City of Manila,Tondo I/II
32682,806410012,Santo Rosario,86410012.0,barangay,,,,R,894,,,12,10,64,8,806410012,806410,8064,8,Region VIII (Eastern Visayas),Southern Leyte,Padre Burgos
15054,405617013,Pagsangahan,45617013.0,barangay,,,,R,1733,,,13,17,56,4,405617013,405617,4056,4,Region IV-A (CALABARZON),Quezon,General Nakar
6050,105535015,San Roque,15535015.0,barangay,,,,U,5600,,,15,35,55,1,105535015,105535,1055,1,Region I (Ilocos Region),Pangasinan,San Manuel
42475,1906601006,Bunut,156601006.0,barangay,,,,R,2200,,,6,1,66,19,1906601006,1906601,19066,19,Bangsamoro Autonomous Region In Muslim Mindana...,Sulu,Indanan
19519,501725005,Gatbo,51725005.0,barangay,,,,R,3097,,,5,25,17,5,501725005,501725,5017,5,Region V (Bicol Region),Camarines Sur,Ocampo
10149,304916032,San Pedro,34916032.0,barangay,,,,R,1660,,,32,16,49,3,304916032,304916,3049,3,Region III (Central Luzon),Nueva Ecija,Lupao
8376,203137044,San Vicente,23137044.0,barangay,,,,R,1738,,,44,37,31,2,203137044,203137,2031,2,Region II (Cagayan Valley),Isabela,Tumauini


In [4]:
clean_mun = (
    df["municipality_or_city"]
    .astype(str)
    .str.lower()
    .str.replace(" ", "")
    .str.replace("(POB.)", "")
)
clean_name = (
    df["name"].astype(str).str.lower().str.replace(" ", "").str.replace("(POB.)", "")
)

df["candidate_hook"] = clean_mun+clean_name

In [5]:
df

Unnamed: 0,psgc_id,name,correspondence_code,geographic_level,old_names,city_class,income_classification,settlement_type,population,Unnamed: 9,barangay_status,barangay_code,municipal_or_city_code,province_or_huc_code,region_code,barangay_mapper,municipal_or_city_mapper,province_or_huc_mapper,region_mapper,region,province_or_huc,municipality_or_city,candidate_hook
0,1300000000,National Capital Region (NCR),130000000.0,region,,,,,13484462,,,000,00,000,13,1300000000,1300000,13000,13,National Capital Region (NCR),,,nannationalcapitalregion(ncr)
1,1380100000,City of Caloocan,137501000.0,city,,HUC,1st,,1661584,,,000,00,801,13,1380100000,1380100,13801,13,National Capital Region (NCR),City of Caloocan,,nancityofcaloocan
2,1380100001,Barangay 1,137501001.0,barangay,,,,U,2319,,,001,00,801,13,1380100001,1380100,13801,13,National Capital Region (NCR),City of Caloocan,,nanbarangay1
3,1380100002,Barangay 2,137501002.0,barangay,,,,U,5156,,,002,00,801,13,1380100002,1380100,13801,13,National Capital Region (NCR),City of Caloocan,,nanbarangay2
4,1380100003,Barangay 3,137501003.0,barangay,,,,U,2497,,,003,00,801,13,1380100003,1380100,13801,13,National Capital Region (NCR),City of Caloocan,,nanbarangay3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43764,1999908006,Manaulanan,124712037.0,barangay,,,,U,7632,,,006,08,999,19,1999908006,1999908,19999,19,Bangsamoro Autonomous Region In Muslim Mindana...,Special Geographic Area,Tugunan,tugunanmanaulanan
43765,1999908007,Pamalian,124712062.0,barangay,,,,R,3256,,,007,08,999,19,1999908007,1999908,19999,19,Bangsamoro Autonomous Region In Muslim Mindana...,Special Geographic Area,Tugunan,tugunanpamalian
43766,1999908008,Tapodoc,124717017.0,barangay,,,,R,1767,,,008,08,999,19,1999908008,1999908,19999,19,Bangsamoro Autonomous Region In Muslim Mindana...,Special Geographic Area,Tugunan,tugunantapodoc
43767,1999908009,Macabual,124712034.0,barangay,,,,R,4557,,,009,08,999,19,1999908009,1999908,19999,19,Bangsamoro Autonomous Region In Muslim Mindana...,Special Geographic Area,Tugunan,tugunanmacabual


In [6]:
from typing import List
def sanitize_input(input_str: str, exclude: List[str] | str | None = None) -> str:
    """
    Removes whitespaces, lowers, and remove all strings listed in exclude
    """
    sanitized_str = input_str.lower()
    if exclude is None:
        return sanitized_str
    
    if isinstance(exclude, list):
        exclude = [x.lower() for x in exclude if isinstance(x, str)]
        for item in exclude:
            sanitized_str.replace(item, "")
        return sanitized_str
    
    return sanitized_str.replace(exclude.lower(),"")



In [7]:
input_str = "BACARRALIBTONG"
sanitized_input = sanitize_input(input_str)

df["sanitized_candidate_hook"] = df["candidate_hook"].apply(
    sanitize_input, args=("(pob.)",)
)
df["score"] = df["sanitized_candidate_hook"].apply(fuzz.ratio, args=(sanitized_input,)).round(1)

In [8]:
df["score"].value_counts().reset_index().sort_values(by="score", ascending=False)

Unnamed: 0,score,count
169,100.0,1
183,80.0,1
168,76.9,1
167,74.3,1
154,72.0,2
...,...,...
184,8.7,1
171,8.3,1
165,7.7,2
186,7.4,1


# Loading Masterlist of Schools
Ginamit ko paid excel sa work to parse data 🤪

In [9]:
import pandas as pd

In [10]:
basedf = pd.read_csv("../resources/hidden/dataset.csv", encoding="iso-8859-1")
basedf.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122396 entries, 0 to 122395
Data columns (total 16 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Id             122396 non-null  object
 1   Name           122396 non-null  object
 2   Kind           122396 non-null  object
 3   Data.Column1   122396 non-null  object
 4   Data.Column2   122393 non-null  object
 5   Data.Column3   120886 non-null  object
 6   Data.Column4   54155 non-null   object
 7   Data.Column5   59637 non-null   object
 8   Data.Column6   120764 non-null  object
 9   Data.Column7   61253 non-null   object
 10  Data.Column8   122039 non-null  object
 11  Data.Column9   61441 non-null   object
 12  Data.Column10  120628 non-null  object
 13  Data.Column11  61466 non-null   object
 14  Data.Column12  61469 non-null   object
 15  Data.Column13  121966 non-null  object
dtypes: object(16)
memory usage: 14.9+ MB


  basedf = pd.read_csv("../resources/hidden/dataset.csv", encoding="iso-8859-1")


In [None]:
# first 3 rows r empty
df = basedf[3:]     

# all column names were parsed as rows so i have to know where r these column names
filter = df.apply(lambda row: row.isin(["BEIS School ID"]).any(), axis=1)

# now separating per page (kse nga naging row yung header so alam ko na kapag may header
# that's a page boundary)
slicer0 = df.loc[filter].index.to_list()
slicer1 = slicer0[1:] + [900_000]
slicers = zip(slicer0, slicer1)

from typing import List
import numpy as np

# tapos gagawin ko nang unified dataframe
datasets: List[pd.DataFrame] = []
for start, end in slicers:
    datasets.append(df.loc[start:end])

# super messy ng kakalabasan nito there are columns name "Page001" kasi yun ang na-parse
# ni excel so there would be like 500 columns of "Page###" which I don't want to I have
# to remove that as well
for idx, dataset in enumerate(datasets):
    dataset.columns = dataset.iloc[0]
    dataset = dataset.iloc[1:].iloc[:,1:]
    if np.nan in dataset.columns:
        dataset = dataset.drop(np.nan, axis=1)
    for col in dataset.columns:
        if "page" in col.lower():
            dataset = dataset.drop(col, axis=1)
    datasets[idx] = dataset

    

In [None]:
from IPython.display import display
from tqdm import tqdm


root_df: pd.DataFrame = pd.DataFrame()
for dataset in tqdm(datasets):
    if "Data.Column1" in dataset.columns:
        display(dataset)
        print("Found Data.Column1")
        break
    try:
        root_df = pd.concat(
            [root_df, dataset]
        )
    except Exception as e:
        display(dataset)
        print("found error")
        print(e)
        break

100%|██████████| 545/545 [00:00<00:00, 858.37it/s] 


In [13]:
root_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122392 entries, 4 to 122395
Data columns (total 14 columns):
 #   Column                                       Non-Null Count   Dtype 
---  ------                                       --------------   ----- 
 0   Region                                       122392 non-null  object
 1   Division                                     122392 non-null  object
 2   District                                     120885 non-null  object
 3   BEIS School ID                               121753 non-null  object
 4   School Name                                  122384 non-null  object
 5   Street Address                               118887 non-null  object
 6   Municipality                                 122386 non-null  object
 7   Legislative District                         61467 non-null   object
 8   Barangay                                     61416 non-null   object
 9   Sector                                       61467 non-null   object
 

In [14]:
root_df

Unnamed: 0,Region,Division,District,BEIS School ID,School Name,Street Address,Municipality,Legislative District,Barangay,Sector,Urban/Ru r,School Subclassification,Modified Curricural Offering Classification,Table
4,Region I,Ilocos Norte,Bacarra I,100001,Apaleng-Libtong ES,"Brgy. 21, Libtong, Bacarra, Ilocos Norte",BACARRA,1st District,LIBTONG,Public,Partially U r,DepED Managed,Purely ES,
5,Region I,Ilocos Norte,Bacarra I,100002,Bacarra CES,Santa Rita,BACARRA,1st District,SANTA RITA (POB.),Public,Partially U r,DepED Managed,Purely ES,
6,Region I,Ilocos Norte,Bacarra I,100003,Buyon ES,NONE,BACARRA,1st District,BUYON,Public,Partially U r,DepED Managed,Purely ES,
7,Region I,Ilocos Norte,Bacarra I,100004,Ganagan Elementary School,"#37 Ganagan,Bacarra, Ilocos Norte",BACARRA,1st District,GANAGAN,Public,Partially U r,DepED Managed,Purely ES,
8,Region I,Ilocos Norte,Bacarra I,100005,Macupit ES,Macupit,BACARRA,1st District,MACUPIT,Public,Partially U r,DepED Managed,Purely ES,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122391,NCR,Muntinlupa City,Muntinlupa Ci t,488533,"CBC Integrated School, Inc.","1560 Estanislao Street, Lakeview Homes I",CITY OF MUNTINLUPA,,,,,,,Table
122392,NCR,Muntinlupa City,Muntinlupa Ci t,488544,"The Linden Tree Institute, Inc.",177 Buencamino Street,CITY OF MUNTINLUPA,,,,,,,Table
122393,NCR,Muntinlupa City,Muntinlupa Ci t,488547,Cambridge Children's Learning and Development ...,Lower Ground Level Alabang Town Center,CITY OF MUNTINLUPA,,,,,,,Table
122394,NCR,Muntinlupa City,Muntinlupa Ci t,488548,Holy Word Christian School,"4 Cattleya St., Doña Rosario Bayview Subdivision",CITY OF MUNTINLUPA,,,,,,,Table


In [15]:
basedf.loc[122391]

Id                                               Table001
Name                                Table001 (Page 1-544)
Kind                                                Table
Data.Column1                                          NCR
Data.Column2                              Muntinlupa City
Data.Column3                              Muntinlupa Ci t
Data.Column4                                          NaN
Data.Column5                                          NaN
Data.Column6                                       488533
Data.Column7                                          NaN
Data.Column8                  CBC Integrated School, Inc.
Data.Column9                                          NaN
Data.Column10    1560 Estanislao Street, Lakeview Homes I
Data.Column11                                         NaN
Data.Column12                                         NaN
Data.Column13                          CITY OF MUNTINLUPA
Name: 122391, dtype: object

## Damn, I feel like this is too messy. Maybe excel might not be the best tool to use to parse the data????

In [24]:
# let's try tabula-py
import tabula

In [17]:
df = tabula.read_pdf("../resources/hidden/masterlist.pdf", pages="all")

In [18]:
from tqdm import tqdm
root_df = pd.DataFrame()
for idx, d in enumerate(tqdm(df)):
    d["page"] = idx 
    root_df = pd.concat([root_df, d])


100%|██████████| 544/544 [00:03<00:00, 176.82it/s] 


In [22]:
root_df = root_df.reset_index(drop=True)

In [23]:
root_df[root_df["BEIS School ID"].str.strip().str.isnumeric().notna()]

Unnamed: 0,Region,Division,District,BEIS School ID,School Name,Street Address,Municipality,Legislative District,Barangay,Sector,Urban/Ru,Sacl hColaosls Sifuicbactliaosnsification,Modified Curricural Offering Classification,page
0,Region I,Ilocos Norte,Bacarra I,100001,Apaleng-Libtong ES,"Brgy. 21, Libtong, Bacarra, Ilocos Norte",BACARRA,1st District,LIBTONG,Public,Partially U,bDaenpED Managed,Purely ES,0
1,Region I,Ilocos Norte,Bacarra I,100002,Bacarra CES,Santa Rita,BACARRA,1st District,SANTA RITA (POB.),Public,Partially U,bDaenpED Managed,Purely ES,0
2,Region I,Ilocos Norte,Bacarra I,100003,Buyon ES,NONE,BACARRA,1st District,BUYON,Public,Partially U,bDaenpED Managed,Purely ES,0
3,Region I,Ilocos Norte,Bacarra I,100004,Ganagan Elementary School,"#37 Ganagan,Bacarra, Ilocos Norte",BACARRA,1st District,GANAGAN,Public,Partially U,bDaenpED Managed,Purely ES,0
4,Region I,Ilocos Norte,Bacarra I,100005,Macupit ES,Macupit,BACARRA,1st District,MACUPIT,Public,Partially U,bDaenpED Managed,Purely ES,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60919,NCR,Muntinlupa City,Muntinlupa Ci,y District II488533,"CBC Integrated School, Inc.","1560 Estanislao Street, Lakeview Homes I",CITY OF MUNTINLUPA,Lone District,PUTATAN,Private,Urban,Non-Sectarian,Purely ES,543
60920,NCR,Muntinlupa City,Muntinlupa Ci,y District II488544,"The Linden Tree Institute, Inc.",177 Buencamino Street,CITY OF MUNTINLUPA,Lone District,ALABANG,Private,Urban,Non-Sectarian,ES and JHS (K to 10),543
60921,NCR,Muntinlupa City,Muntinlupa Ci,y District II488547,Cambridge Children's Learning and Development ...,Lower Ground Level Alabang Town Center,CITY OF MUNTINLUPA,Lone District,ALABANG,Private,Urban,Sectarian,Purely ES,543
60922,NCR,Muntinlupa City,Muntinlupa Ci,y District II488548,Holy Word Christian School,"4 Cattleya St., Doña Rosario Bayview Subdivision",CITY OF MUNTINLUPA,Lone District,SUCAT,Private,Urban,Sectarian,ES and JHS (K to 10),543


boom it fucken worked

### Data Cleaning