In [1]:
import json
from typing import Dict, Set

import pandas as pd
import numpy as np
import yaml
from IPython.display import display
from tqdm.notebook import tqdm


from pydantic import ValidationError

from pydantic import BaseModel

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)

psg_directory = "../resources/"
psg_data_file = "psgc_2025-08-07.csv"

In [2]:
df = pd.read_csv(psg_directory + psg_data_file)
display(df.info())
display(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43769 entries, 0 to 43768
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   psgc_id                43769 non-null  int64  
 1   name                   43769 non-null  object 
 2   correspondence_code    43719 non-null  float64
 3   geographic_level       43767 non-null  object 
 4   old_names              1699 non-null   object 
 5   city_class             149 non-null    object 
 6   income_classification  1724 non-null   object 
 7   settlement_type        42011 non-null  object 
 8   population             43762 non-null  object 
 9   Unnamed: 9             75 non-null     object 
 10  barangay_status        2855 non-null   object 
dtypes: float64(1), int64(1), object(9)
memory usage: 3.7+ MB


None

Unnamed: 0,psgc_id,name,correspondence_code,geographic_level,old_names,city_class,income_classification,settlement_type,population,Unnamed: 9,barangay_status
0,1300000000,National Capital Region (NCR),130000000.0,Reg,,,,,13484462,,
1,1380100000,City of Caloocan,137501000.0,City,,HUC,1st,,1661584,,
2,1380100001,Barangay 1,137501001.0,Bgy,,,,U,2319,,
3,1380100002,Barangay 2,137501002.0,Bgy,,,,U,5156,,
4,1380100003,Barangay 3,137501003.0,Bgy,,,,U,2497,,
...,...,...,...,...,...,...,...,...,...,...,...
43764,1999908006,Manaulanan,124712037.0,Bgy,,,,U,7632,,
43765,1999908007,Pamalian,124712062.0,Bgy,,,,R,3256,,
43766,1999908008,Tapodoc,124717017.0,Bgy,,,,R,1767,,
43767,1999908009,Macabual,124712034.0,Bgy,,,,R,4557,,


In [3]:
df["psgc_id"] = df["psgc_id"].astype(str).str.zfill(10)
df = df.map(lambda x: x.strip() if isinstance(x, str) else x)
df["population"] = df["population"].str.replace(",","").replace("-", np.nan).astype("Int64")

In [4]:
geographic_level_map = {
    "Reg":"region",
    "City":"city",
    "Mun":"municipality",
    "Prov":"province",
    "SubMun":"submunicipality", 
    "Bgy":"barangay"
}
df["geographic_level"] = df["geographic_level"].replace(geographic_level_map)

df["barangay_code"] = df["psgc_id"].str[-3:]
df["municipal_or_city_code"] = df["psgc_id"].str[-5:-3]
df["province_or_huc_code"] = df["psgc_id"].str[-8:-5]
df["region_code"] = df["psgc_id"].str[-10:-8]

df["barangay_mapper"] = df["psgc_id"].str[-10:]
df["municipal_or_city_mapper"] = df["psgc_id"].str[-10:-3]
df["province_or_huc_mapper"] = df["psgc_id"].str[-10:-5]
df["region_mapper"] = df["psgc_id"].str[-10:-8]

df.sample(10)

Unnamed: 0,psgc_id,name,correspondence_code,geographic_level,old_names,city_class,income_classification,settlement_type,population,Unnamed: 9,barangay_status,barangay_code,municipal_or_city_code,province_or_huc_code,region_code,barangay_mapper,municipal_or_city_mapper,province_or_huc_mapper,region_mapper
8242,203133008,Calamagui,23133008.0,barangay,,,,R,1343,,,8,33,31,2,203133008,203133,2031,2
37613,1102406023,Poblacion,112406023.0,barangay,,,,U,5379,,,23,6,24,11,1102406023,1102406,11024,11
5615,105524045,Mabulitec,15524045.0,barangay,,,,R,974,,,45,24,55,1,105524045,105524,1055,1
19147,501714017,Barangay IV,51714017.0,barangay,,,,U,2713,,Pob.,17,14,17,5,501714017,501714,5017,5
9162,301406004,Camachilihan,31406004.0,barangay,,,,U,2542,,,4,6,14,3,301406004,301406,3014,3
38264,1108603023,Macol,118603023.0,barangay,,,,R,1381,,,23,3,86,11,1108603023,1108603,11086,11
18103,500508011,Bay,50508011.0,barangay,,,,R,1443,,,11,8,5,5,500508011,500508,5005,5
43537,1908812003,Datalpandan,153825003.0,barangay,,,,R,1998,,,3,12,88,19,1908812003,1908812,19088,19
35407,1001312047,Barangay 4,101312047.0,barangay,,,,U,344,,Pob.,47,12,13,10,1001312047,1001312,10013,10
10044,304911069,Yuson,34911069.0,barangay,,,,R,987,,,69,11,49,3,304911069,304911,3049,3


In [5]:
regions_filter = (
    (df["province_or_huc_code"] == "000")
    & (df["municipal_or_city_code"] == "00")
    & (df["barangay_code"] == "000")
)
regions_mapper = (
    df.loc[regions_filter, ["region_mapper", "name"]]
    .sort_values("region_mapper")
    .set_index("region_mapper", drop=True)
    .to_dict()["name"]
)
regions_mapper

{'01': 'Region I (Ilocos Region)',
 '02': 'Region II (Cagayan Valley)',
 '03': 'Region III (Central Luzon)',
 '04': 'Region IV-A (CALABARZON)',
 '05': 'Region V (Bicol Region)',
 '06': 'Region VI (Western Visayas)',
 '07': 'Region VII (Central Visayas)',
 '08': 'Region VIII (Eastern Visayas)',
 '09': 'Region IX (Zamboanga Peninsula)',
 '10': 'Region X (Northern Mindanao)',
 '11': 'Region XI (Davao Region)',
 '12': 'Region XII (SOCCSKSARGEN)',
 '13': 'National Capital Region (NCR)',
 '14': 'Cordillera Administrative Region (CAR)',
 '16': 'Region XIII (Caraga)',
 '17': 'MIMAROPA Region',
 '18': 'Negros Island Region (NIR)',
 '19': 'Bangsamoro Autonomous Region In Muslim Mindanao (BARMM)'}

In [6]:
province_or_huc_filter = (
    ~(df["province_or_huc_code"] == "000")
    & (df["municipal_or_city_code"] == "00")
    & (df["barangay_code"] == "000")
)

province_or_huc_mapper = (
    df.loc[province_or_huc_filter, ["province_or_huc_mapper", "name"]]
    .sort_values("province_or_huc_mapper")
    .set_index("province_or_huc_mapper")
    .to_dict()["name"]
)
province_or_huc_mapper

{'01028': 'Ilocos Norte',
 '01029': 'Ilocos Sur',
 '01033': 'La Union',
 '01055': 'Pangasinan',
 '02009': 'Batanes',
 '02015': 'Cagayan',
 '02031': 'Isabela',
 '02050': 'Nueva Vizcaya',
 '02057': 'Quirino',
 '03008': 'Bataan',
 '03014': 'Bulacan',
 '03049': 'Nueva Ecija',
 '03054': 'Pampanga',
 '03069': 'Tarlac',
 '03071': 'Zambales',
 '03077': 'Aurora',
 '03301': 'City of Angeles',
 '03314': 'City of Olongapo',
 '04010': 'Batangas',
 '04021': 'Cavite',
 '04034': 'Laguna',
 '04056': 'Quezon',
 '04058': 'Rizal',
 '04312': 'City of Lucena',
 '05005': 'Albay',
 '05016': 'Camarines Norte',
 '05017': 'Camarines Sur',
 '05020': 'Catanduanes',
 '05041': 'Masbate',
 '05062': 'Sorsogon',
 '06004': 'Aklan',
 '06006': 'Antique',
 '06019': 'Capiz',
 '06030': 'Iloilo',
 '06079': 'Guimaras',
 '06310': 'City of Iloilo',
 '07012': 'Bohol',
 '07022': 'Cebu',
 '07306': 'City of Cebu',
 '07311': 'City of Lapu-Lapu',
 '07313': 'City of Mandaue',
 '08026': 'Eastern Samar',
 '08037': 'Leyte',
 '08048': 'Nor

In [7]:
municipal_or_city_filter = (
    ~(df["province_or_huc_code"] == "000")
    & ~(df["municipal_or_city_code"] == "00")
    & (df["barangay_code"] == "000")
)

municipal_or_city_mapper = (
    df.loc[municipal_or_city_filter, ["municipal_or_city_mapper", "name"]]
    .sort_values("municipal_or_city_mapper")
    .set_index("municipal_or_city_mapper")
    .to_dict()["name"]
)
municipal_or_city_mapper

{'0102801': 'Adams',
 '0102802': 'Bacarra',
 '0102803': 'Badoc',
 '0102804': 'Bangui',
 '0102805': 'City of Batac',
 '0102806': 'Burgos',
 '0102807': 'Carasi',
 '0102808': 'Currimao',
 '0102809': 'Dingras',
 '0102810': 'Dumalneg',
 '0102811': 'Banna',
 '0102812': 'City of Laoag',
 '0102813': 'Marcos',
 '0102814': 'Nueva Era',
 '0102815': 'Pagudpud',
 '0102816': 'Paoay',
 '0102817': 'Pasuquin',
 '0102818': 'Piddig',
 '0102819': 'Pinili',
 '0102820': 'San Nicolas',
 '0102821': 'Sarrat',
 '0102822': 'Solsona',
 '0102823': 'Vintar',
 '0102901': 'Alilem',
 '0102902': 'Banayoyo',
 '0102903': 'Bantay',
 '0102904': 'Burgos',
 '0102905': 'Cabugao',
 '0102906': 'City of Candon',
 '0102907': 'Caoayan',
 '0102908': 'Cervantes',
 '0102909': 'Galimuyod',
 '0102910': 'Gregorio del Pilar',
 '0102911': 'Lidlidda',
 '0102912': 'Magsingal',
 '0102913': 'Nagbukel',
 '0102914': 'Narvacan',
 '0102915': 'Quirino',
 '0102916': 'Salcedo',
 '0102917': 'San Emilio',
 '0102918': 'San Esteban',
 '0102919': 'San Il

In [8]:
df["region"] = df["region_mapper"].map(regions_mapper)
df["province_or_huc"] = df["province_or_huc_mapper"].map(province_or_huc_mapper)
df["municipality_or_city"] = df["municipal_or_city_mapper"].map(municipal_or_city_mapper)

In [9]:
barangay_df = df[df["geographic_level"]=="barangay"].reset_index(drop=True)

In [10]:
# building dictionary
empty_municipality = barangay_df["municipality_or_city"].isna()
empty_province_or_huc = barangay_df["province_or_huc"].isna()

In [11]:
mdf = barangay_df[~empty_municipality & ~empty_province_or_huc][
    [
        "region",
        "region_mapper",
        "province_or_huc",
        "province_or_huc_mapper",
        "municipality_or_city",
        "municipal_or_city_mapper",
        "name"
    ]
].sort_values(["region", "province_or_huc", "municipality_or_city"])

empty_municipality_df = barangay_df[empty_municipality & ~empty_province_or_huc][
    [
        "region",
        "region_mapper",
        "province_or_huc",
        "province_or_huc_mapper",
        "municipality_or_city",
        "municipal_or_city_mapper",
        "name"
    ]
].sort_values(["region", "province_or_huc", "municipality_or_city"])

empty_province_df = barangay_df[~empty_municipality & empty_province_or_huc][
    [
        "region",
        "region_mapper",
        "province_or_huc",
        "province_or_huc_mapper",
        "municipality_or_city",
        "municipal_or_city_mapper",
        "name"
    ]
].sort_values(["region", "province_or_huc", "municipality_or_city"])

In [12]:
root_dict: Dict[str, Dict[str, Set[str] | Dict[str, Set]]] = {}
for idx,(i, j, k, l) in mdf[["region","province_or_huc", "municipality_or_city","name"]].iterrows():
    if i not in root_dict.keys():
        root_dict[i] = {}
    if j not in root_dict[i].keys():
        root_dict[i][j] = {}
    if k not in root_dict[i][j].keys():
        root_dict[i][j][k] = set()
    root_dict[i][j][k].add(l)

# handling empty municipality
for idx,(i,j,k) in empty_municipality_df[["region","province_or_huc","name"]].iterrows():
    if i not in root_dict.keys():
        root_dict[i] = {}
    if j not in root_dict[i].keys():
        root_dict[i][j] = set()
    root_dict[i][j].add(k)

# handling empty prov
for idx,(i,j,k) in empty_province_df[["region","municipality_or_city","name"]].iterrows():
    if i not in root_dict.keys():
        root_dict[i] = {}
    if j not in root_dict[i].keys():
        root_dict[i][j] = set()
    root_dict[i][j].add(k)

In [13]:
from typing import Literal, Optional, List

from pydantic import Field


class Location(BaseModel):
    name: str
    type: Literal[
        "country",
        "region",
        "province",
        "city",
        "municipality",
        "barangay",
        "special_geographic_area",
        "submunicipality",
    ]
    psgc_id: str | Literal["n/a"]
    parent_psgc_id: str | Literal["n/a"]
    nicknames: Optional[List[str]] = None
    components: List["Location"] = Field(default_factory=list)

In [65]:
df[(df["province_or_huc"].notna()) & (~df["municipality_or_city"].notna())].sample(10)

Unnamed: 0,psgc_id,name,correspondence_code,geographic_level,old_names,city_class,income_classification,settlement_type,population,Unnamed: 9,barangay_status,barangay_code,municipal_or_city_code,province_or_huc_code,region_code,barangay_mapper,municipal_or_city_mapper,province_or_huc_mapper,region_mapper,region,province_or_huc,municipality_or_city
12045,331400005,East Bajac-bajac,37107005.0,barangay,,,,U,19387,,,5,0,314,3,331400005,331400,3314,3,Region III (Central Luzon),City of Olongapo,
3000,1430300152,Phil-Am,141102152.0,barangay,,,,R,517,,,152,0,303,14,1430300152,1430300,14303,14,Cordillera Administrative Region (CAR),City of Baguio,
38397,1130700090,Pangyan,112402090.0,barangay,,,,U,2340,,,90,0,307,11,1130700090,1130700,11307,11,Region XI (Davao Region),City of Davao,
24807,631000181,Tap-oc,63022181.0,barangay,,,,U,356,,,181,0,310,6,631000181,631000,6310,6,Region VI (Western Visayas),City of Iloilo,
37263,1030900032,Del Carmen,103504032.0,barangay,,,,U,9662,,,32,0,309,10,1030900032,1030900,10309,10,Region X (Northern Mindanao),City of Iligan,
212,1380200017,Talon Dos,137601017.0,barangay,,,,U,43978,,,17,0,802,13,1380200017,1380200,13802,13,National Capital Region (NCR),City of Las Piñas,
1476,1381200009,Kapitolyo,137403009.0,barangay,,,,U,9203,,,9,0,812,13,1381200009,1381200,13812,13,National Capital Region (NCR),City of Pasig,
37206,1030500052,Kauswagan,104305052.0,barangay,,,,U,40239,,,52,0,305,10,1030500052,1030500,10305,10,Region X (Northern Mindanao),City of Cagayan De Oro,
35065,931700010,Boalan,97332010.0,barangay,,,,U,11541,,,10,0,317,9,931700010,931700,9317,9,Region IX (Zamboanga Peninsula),City of Zamboanga,
16426,1705100000,Occidental Mindoro,175100000.0,province,,,1st,,525354,,,0,0,51,17,1705100000,1705100,17051,17,MIMAROPA Region,Occidental Mindoro,


In [49]:
root = Location(
    name="Philippines", psgc_id="0000000000", type="country", parent_psgc_id="n/a"
)
for region in root_dict:
    row = df[df["name"] == region].iloc[0]
    new_location = Location(
        name=row["name"],
        type="region",
        psgc_id=row["psgc_id"],
        parent_psgc_id="0000000000",
    )
    if new_location not in root.components:
        root.components.append(new_location)

# RESOLVE PROVINCES & HUC UNDER REGIONS
for region in root.components:
    provinces_or_hucs_in_region = df[
        (df["region"] == region.name)
        & ~(df["province_or_huc_code"] == "000") # this means that this is a province
        & (df["municipal_or_city_code"] == "00")
        & (df["barangay_code"] == "000")
    ]
    for idx, prov_or_huc in provinces_or_hucs_in_region.iterrows():
        if prov_or_huc["psgc_id"]=="1999900000":
            prov_or_huc["geographic_level"] = "special_geographic_area"
        if prov_or_huc["psgc_id"]=="0990100000":
            prov_or_huc["geographic_level"] = "city" 
        try:
            newloc = Location(
                name=prov_or_huc["name"],
                type=prov_or_huc["geographic_level"],
                psgc_id=prov_or_huc["psgc_id"],
                parent_psgc_id=region.psgc_id,
            )
        except ValidationError as e:
            print("############## ERROR")
            print(prov_or_huc)
        region.components.append(newloc)

# RESOLVE CITIES & MUNICIPALITIES DIRECTLY UNDER REGIONS
for region in root.components:
    municipality_or_city_in_region = df[
        (df["region"] == region.name)
        & ~(df["province_or_huc_code"] == "000") # this means that this is a province
        & ~(df["municipal_or_city_code"] == "00")
        & (df["barangay_code"] == "000")
        & ~(df["province_or_huc"].notna())
    ]
    for idx, mun_or_city in municipality_or_city_in_region.iterrows():
        try:
            newloc = Location(
                name=mun_or_city["name"],
                type=mun_or_city["geographic_level"],
                psgc_id=mun_or_city["psgc_id"],
                parent_psgc_id=region.psgc_id,
            )
        except ValidationError as e:
            print("############## ERROR")
            print(mun_or_city)
        region.components.append(newloc)

# RESOLVE CITIES & MUNICIPALITIES UNDER PROVINCE & HUCs
for region in root.components:
    for province_or_huc in region.components:
        municipality_or_city_in_province_or_huc = df[
            (df["province_or_huc"] == province_or_huc.name)
            & ~(df["province_or_huc_code"] == "000")
            & ~(df["municipal_or_city_code"] == "00")
            & (df["barangay_code"] == "000")
            & (df["province_or_huc"].notna())
            & (df["municipality_or_city"].notna())
        ]
        for idx, mun_or_city in municipality_or_city_in_province_or_huc.iterrows():
            try:
                newloc = Location(
                    name=mun_or_city["name"],
                    type=mun_or_city["geographic_level"],
                    psgc_id=mun_or_city["psgc_id"],
                    parent_psgc_id=province_or_huc.psgc_id,
                )
            except ValidationError as e:
                print(e)
                print("############## ERROR")
                print(mun_or_city)
            province_or_huc.components.append(newloc)

# RESOLVE BARANGAY IF ITS UNDER A MUNICIPALITY OR CITY AND UNDER A PROVINCE OR HUC
for region in tqdm(root.components, leave=True, ascii=True):
    for province_or_huc in region.components:
        for municipality_or_city in province_or_huc.components:
            barangay_in_municipality_or_city = df[
                (df["municipality_or_city"] == municipality_or_city.name)
                & ~(df["province_or_huc_code"] == "000")
                & ~(df["municipal_or_city_code"] == "00")
                & ~(df["barangay_code"] == "000")
                & (df["province_or_huc"].notna())
                & (df["municipality_or_city"].notna())
            ]
            if province_or_huc.psgc_id == "1630400000":
                display(barangay_in_municipality_or_city)
            for idx, barangay in barangay_in_municipality_or_city.iterrows():
                try:
                    newloc = Location(
                        name=barangay["name"],
                        type=barangay["geographic_level"],
                        psgc_id=barangay["psgc_id"],
                        parent_psgc_id=municipality_or_city.psgc_id,
                    )
                except ValidationError as e:
                    print(e)
                    print("############## ERROR")
                    print(barangay)
                municipality_or_city.components.append(newloc)

# RESOLVE BARANGAYS DIRECTLY UNDER PROVINCE OR HUCS
for region in tqdm(root.components, leave=True, ascii=True):
    for province_or_huc in region.components:
        barangay_in_province_or_huc = df[
            (df["province_or_huc"] == province_or_huc.name)
            & ~(df["province_or_huc_code"] == "000")
            & (df["municipal_or_city_code"] == "00")
            & ~(df["barangay_code"] == "000")
            & (df["province_or_huc"].notna())
            & ~(df["municipality_or_city"].notna())
        ]
        for idx, barangay in barangay_in_province_or_huc.iterrows():
            try:
                newloc = Location(
                    name=barangay["name"],
                    type=barangay["geographic_level"],
                    psgc_id=barangay["psgc_id"],
                    parent_psgc_id=province_or_huc.psgc_id,
                )
            except ValidationError as e:
                print(e)
                print("############## ERROR")
                print(province_or_huc)
            province_or_huc.components.append(newloc)

# RESOLVE BARANGAY UNDER MUNICIPALITY THAT IS UNDER REGIONS DIRECTLY
for region in tqdm(root.components, leave=True, ascii=True):
    for municipality_or_city in region.components:
        barangay_in_municipality_or_city = df[
            (df["municipality_or_city"] == municipality_or_city.name)
            & ~(df["province_or_huc_code"] == "000")
            & ~(df["municipal_or_city_code"] == "00")
            & ~(df["barangay_code"] == "000")
            & ~(df["province_or_huc"].notna())
            & (df["municipality_or_city"].notna())
        ]
        for idx, barangay in barangay_in_municipality_or_city.iterrows():
            try:
                newloc = Location(
                    name=barangay["name"],
                    type=barangay["geographic_level"],
                    psgc_id=barangay["psgc_id"],
                    parent_psgc_id=municipality_or_city.psgc_id,
                )
            except ValidationError as e:
                print(e)
                print("############## ERROR")
                print(mun_or_city)
            municipality_or_city.components.append(newloc)


  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

In [59]:
json_dump = json.dumps(root.model_dump(), indent=4)
yaml_dump = yaml.safe_dump(root.model_dump(), sort_keys=False)

In [56]:
with open("../barangay/barangay_extended.json", encoding="utf8", mode="w") as fp:
    fp.write(json_dump)

In [60]:
with open("../barangay/barangay_extended.yaml", encoding="utf8", mode="w") as fp:
    fp.write(yaml_dump)