In [15]:
import json
from typing import Dict, Set

import pandas as pd
import yaml
from IPython.display import display

# Input CSV: the file name of the PSGC snapshot and the folder it is read from.
psg_data_file = "psgc_2025-08-07.csv"
psg_directory = "../resources/"

# Limit how many rows/columns pandas renders so cell outputs stay readable.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 50)

In [16]:
# Load the raw PSGC table from the configured CSV file.
df = pd.read_csv(psg_directory + psg_data_file)

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only renders a stray "None" under the summary.
df.info()
display(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43769 entries, 0 to 43768
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   psgc_id                43769 non-null  int64  
 1   name                   43769 non-null  object 
 2   correspondence_code    43719 non-null  float64
 3   geographic_level       43767 non-null  object 
 4   old_names              1699 non-null   object 
 5   city_class             149 non-null    object 
 6   income_classification  1724 non-null   object 
 7   settlement_type        42011 non-null  object 
 8    population            43762 non-null  object 
 9   Unnamed: 9             75 non-null     object 
 10  barangay_status        2855 non-null   object 
dtypes: float64(1), int64(1), object(9)
memory usage: 3.7+ MB


None

Unnamed: 0,psgc_id,name,correspondence_code,geographic_level,old_names,city_class,income_classification,settlement_type,population,Unnamed: 9,barangay_status
0,1300000000,National Capital Region (NCR),130000000.0,Reg,,,,,13484462,,
1,1380100000,City of Caloocan,137501000.0,City,,HUC,1st,,1661584,,
2,1380100001,Barangay 1,137501001.0,Bgy,,,,U,2319,,
3,1380100002,Barangay 2,137501002.0,Bgy,,,,U,5156,,
4,1380100003,Barangay 3,137501003.0,Bgy,,,,U,2497,,
...,...,...,...,...,...,...,...,...,...,...,...
43764,1999908006,Manaulanan,124712037.0,Bgy,,,,U,7632,,
43765,1999908007,Pamalian,124712062.0,Bgy,,,,R,3256,,
43766,1999908008,Tapodoc,124717017.0,Bgy,,,,R,1767,,
43767,1999908009,Macabual,124712034.0,Bgy,,,,R,4557,,


In [17]:
# Trim whitespace from the column headers as well as from the values: the CSV
# header contains padded names (the info() summary lists " population" with a
# leading space), which would otherwise have to be addressed with the space.
df.columns = df.columns.str.strip()

# psgc_id is parsed as an integer, which drops leading zeros; restore the
# fixed 10-character form so the positional slicing done later lines up.
df["psgc_id"] = df["psgc_id"].astype(str).str.zfill(10)

# Strip surrounding whitespace from every string cell; other values pass through.
df = df.map(lambda x: x.strip() if isinstance(x, str) else x)

In [18]:
# Spell out the abbreviated geographic level labels; labels that are not in
# the map are left unchanged by replace().
geographic_level_map = dict(
    Reg="region",
    City="city",
    Mun="municipality",
    Prov="province",
    SubMun="submunicipality",
    Bgy="barangay",
)
df["geographic_level"] = df["geographic_level"].replace(geographic_level_map)

# Positional pieces of psgc_id, counted from the right-hand end.
# *_code columns hold a single segment; *_mapper columns hold the leading
# characters up to and including that segment, used later as lookup keys.
psgc_slices = {
    "barangay_code": slice(-3, None),
    "municipal_or_city_code": slice(-5, -3),
    "province_or_huc_code": slice(-8, -5),
    "region_code": slice(-10, -8),
    "barangay_mapper": slice(-10, None),
    "municipal_or_city_mapper": slice(-10, -3),
    "province_or_huc_mapper": slice(-10, -5),
    "region_mapper": slice(-10, -8),
}
for column_name, digits in psgc_slices.items():
    df[column_name] = df["psgc_id"].str[digits]

df.sample(10)

Unnamed: 0,psgc_id,name,correspondence_code,geographic_level,old_names,city_class,income_classification,settlement_type,population,Unnamed: 9,barangay_status,barangay_code,municipal_or_city_code,province_or_huc_code,region_code,barangay_mapper,municipal_or_city_mapper,province_or_huc_mapper,region_mapper
7419,203103030,Victoria,23103030.0,barangay,,,,R,551,,,30,3,31,2,203103030,203103,2031,2
15231,405622012,Binahian A,45622012.0,barangay,,,,R,415,,,12,22,56,4,405622012,405622,4056,4
17041,1705215036,Jose Leido Jr.,175215036.0,barangay,,,,R,2134,,,36,15,52,17,1705215036,1705215,17052,17
5814,105529012,Poblacion West,15529012.0,barangay,,,,R,1782,,,12,29,55,1,105529012,105529,1055,1
31479,806003002,Acedillo,86003002.0,barangay,,,,R,615,,,2,3,60,8,806003002,806003,8060,8
1552,1381300054,Lourdes,137404054.0,barangay,,,,U,4523,,,54,0,813,13,1381300054,1381300,13813,13
25407,1804529008,San Jose,64529008.0,barangay,,,,U,3176,,,8,29,45,18,1804529008,1804529,18045,18
34441,907328012,Gabunon,97328012.0,barangay,,,,R,2121,,,12,28,73,9,907328012,907328,9073,9
13824,402118020,Lalaan II,42118020.0,barangay,,,,U,7858,,,20,18,21,4,402118020,402118,4021,4
15081,405618020,Dungawan Central,45618020.0,barangay,,,,R,732,,,20,18,56,4,405618020,405618,4056,4


In [19]:
# Region rows: the province/HUC, municipality/city and barangay segments are all zero.
regions_filter = (
    df["province_or_huc_code"].eq("000")
    & df["municipal_or_city_code"].eq("00")
    & df["barangay_code"].eq("000")
)

# Lookup table: region prefix -> region name, ordered by prefix.
region_rows = df.loc[regions_filter, ["region_mapper", "name"]].sort_values("region_mapper")
regions_mapper = region_rows.set_index("region_mapper", drop=True)["name"].to_dict()
regions_mapper

{'01': 'Region I (Ilocos Region)',
 '02': 'Region II (Cagayan Valley)',
 '03': 'Region III (Central Luzon)',
 '04': 'Region IV-A (CALABARZON)',
 '05': 'Region V (Bicol Region)',
 '06': 'Region VI (Western Visayas)',
 '07': 'Region VII (Central Visayas)',
 '08': 'Region VIII (Eastern Visayas)',
 '09': 'Region IX (Zamboanga Peninsula)',
 '10': 'Region X (Northern Mindanao)',
 '11': 'Region XI (Davao Region)',
 '12': 'Region XII (SOCCSKSARGEN)',
 '13': 'National Capital Region (NCR)',
 '14': 'Cordillera Administrative Region (CAR)',
 '16': 'Region XIII (Caraga)',
 '17': 'MIMAROPA Region',
 '18': 'Negros Island Region (NIR)',
 '19': 'Bangsamoro Autonomous Region In Muslim Mindanao (BARMM)'}

In [20]:
# Province/HUC rows: a non-zero province/HUC segment with zeroed municipality/city
# and barangay segments.
province_or_huc_filter = (
    df["province_or_huc_code"].ne("000")
    & df["municipal_or_city_code"].eq("00")
    & df["barangay_code"].eq("000")
)

# Lookup table: province/HUC prefix -> name, ordered by prefix.
province_or_huc_rows = df.loc[
    province_or_huc_filter, ["province_or_huc_mapper", "name"]
].sort_values("province_or_huc_mapper")
province_or_huc_mapper = (
    province_or_huc_rows.set_index("province_or_huc_mapper")["name"].to_dict()
)
province_or_huc_mapper

{'01028': 'Ilocos Norte',
 '01029': 'Ilocos Sur',
 '01033': 'La Union',
 '01055': 'Pangasinan',
 '02009': 'Batanes',
 '02015': 'Cagayan',
 '02031': 'Isabela',
 '02050': 'Nueva Vizcaya',
 '02057': 'Quirino',
 '03008': 'Bataan',
 '03014': 'Bulacan',
 '03049': 'Nueva Ecija',
 '03054': 'Pampanga',
 '03069': 'Tarlac',
 '03071': 'Zambales',
 '03077': 'Aurora',
 '03301': 'City of Angeles',
 '03314': 'City of Olongapo',
 '04010': 'Batangas',
 '04021': 'Cavite',
 '04034': 'Laguna',
 '04056': 'Quezon',
 '04058': 'Rizal',
 '04312': 'City of Lucena',
 '05005': 'Albay',
 '05016': 'Camarines Norte',
 '05017': 'Camarines Sur',
 '05020': 'Catanduanes',
 '05041': 'Masbate',
 '05062': 'Sorsogon',
 '06004': 'Aklan',
 '06006': 'Antique',
 '06019': 'Capiz',
 '06030': 'Iloilo',
 '06079': 'Guimaras',
 '06310': 'City of Iloilo',
 '07012': 'Bohol',
 '07022': 'Cebu',
 '07306': 'City of Cebu',
 '07311': 'City of Lapu-Lapu',
 '07313': 'City of Mandaue',
 '08026': 'Eastern Samar',
 '08037': 'Leyte',
 '08048': 'Nor

In [21]:
# Municipality/city rows: non-zero province/HUC and municipality/city segments
# with a zeroed barangay segment.
municipal_or_city_filter = (
    df["province_or_huc_code"].ne("000")
    & df["municipal_or_city_code"].ne("00")
    & df["barangay_code"].eq("000")
)

# Lookup table: municipality/city prefix -> name, ordered by prefix.
municipal_or_city_rows = df.loc[
    municipal_or_city_filter, ["municipal_or_city_mapper", "name"]
].sort_values("municipal_or_city_mapper")
municipal_or_city_mapper = (
    municipal_or_city_rows.set_index("municipal_or_city_mapper")["name"].to_dict()
)
municipal_or_city_mapper

{'0102801': 'Adams',
 '0102802': 'Bacarra',
 '0102803': 'Badoc',
 '0102804': 'Bangui',
 '0102805': 'City of Batac',
 '0102806': 'Burgos',
 '0102807': 'Carasi',
 '0102808': 'Currimao',
 '0102809': 'Dingras',
 '0102810': 'Dumalneg',
 '0102811': 'Banna',
 '0102812': 'City of Laoag',
 '0102813': 'Marcos',
 '0102814': 'Nueva Era',
 '0102815': 'Pagudpud',
 '0102816': 'Paoay',
 '0102817': 'Pasuquin',
 '0102818': 'Piddig',
 '0102819': 'Pinili',
 '0102820': 'San Nicolas',
 '0102821': 'Sarrat',
 '0102822': 'Solsona',
 '0102823': 'Vintar',
 '0102901': 'Alilem',
 '0102902': 'Banayoyo',
 '0102903': 'Bantay',
 '0102904': 'Burgos',
 '0102905': 'Cabugao',
 '0102906': 'City of Candon',
 '0102907': 'Caoayan',
 '0102908': 'Cervantes',
 '0102909': 'Galimuyod',
 '0102910': 'Gregorio del Pilar',
 '0102911': 'Lidlidda',
 '0102912': 'Magsingal',
 '0102913': 'Nagbukel',
 '0102914': 'Narvacan',
 '0102915': 'Quirino',
 '0102916': 'Salcedo',
 '0102917': 'San Emilio',
 '0102918': 'San Esteban',
 '0102919': 'San Il

In [22]:
# Resolve each row's prefixes to the names of its parents. A prefix that is
# missing from its lookup table yields NaN in the new column.
name_lookups = (
    ("region", "region_mapper", regions_mapper),
    ("province_or_huc", "province_or_huc_mapper", province_or_huc_mapper),
    ("municipality_or_city", "municipal_or_city_mapper", municipal_or_city_mapper),
)
for target_column, key_column, lookup in name_lookups:
    df[target_column] = df[key_column].map(lookup)

In [23]:
# Keep only the barangay-level rows and renumber them from zero.
is_barangay = df["geographic_level"].eq("barangay")
barangay_df = df.loc[is_barangay].reset_index(drop=True)

In [24]:
# Building the dictionary: flag barangays whose parent name lookups found no match.
empty_province_or_huc = barangay_df["province_or_huc"].isnull()
empty_municipality = barangay_df["municipality_or_city"].isnull()

In [25]:
def _hierarchy_slice(mask):
    """Return the hierarchy columns of the barangays selected by ``mask``.

    The result keeps the parent names, their lookup prefixes and the barangay
    name, sorted by region, province/HUC and municipality/city name.
    """
    hierarchy_columns = [
        "region",
        "region_mapper",
        "province_or_huc",
        "province_or_huc_mapper",
        "municipality_or_city",
        "municipal_or_city_mapper",
        "name",
    ]
    selected = barangay_df[mask][hierarchy_columns]
    return selected.sort_values(["region", "province_or_huc", "municipality_or_city"])


has_municipality = ~empty_municipality
has_province_or_huc = ~empty_province_or_huc

# Barangays with both parents resolved.
mdf = _hierarchy_slice(has_municipality & has_province_or_huc)

# Barangays with a province/HUC name but no municipality/city name.
empty_municipality_df = _hierarchy_slice(empty_municipality & has_province_or_huc)

# Barangays with a municipality/city name but no province/HUC name.
empty_province_df = _hierarchy_slice(has_municipality & empty_province_or_huc)

# NOTE: barangays missing both parent names match none of the three masks above.

In [30]:
def _leaf_set(container, key):
    """Return the set stored at ``container[key]``, creating it on first use.

    Raises:
        TypeError: if ``key`` already holds a nested dict. This happens when
            the same name is used both as a parent of municipalities and as a
            direct parent of barangays; adding to it would otherwise fail with
            an unexplained ``AttributeError``.
    """
    leaf = container.setdefault(key, set())
    if not isinstance(leaf, set):
        raise TypeError(
            f"{key!r} already holds nested entries and cannot also hold barangays directly"
        )
    return leaf


# region -> province/HUC -> municipality/city -> {barangay names}
# Levels with a missing parent are flattened: region -> parent name -> {barangay names}.
root_dict: Dict[str, Dict[str, Set[str] | Dict[str, Set[str]]]] = {}

# itertuples(index=False, name=None) yields plain tuples of the selected
# columns, avoiding the per-row Series that iterrows() builds.
for region, province_or_huc, municipality_or_city, barangay in mdf[
    ["region", "province_or_huc", "municipality_or_city", "name"]
].itertuples(index=False, name=None):
    municipalities = root_dict.setdefault(region, {}).setdefault(province_or_huc, {})
    municipalities.setdefault(municipality_or_city, set()).add(barangay)

# Handling empty municipality: barangays hang directly off the province/HUC.
for region, province_or_huc, barangay in empty_municipality_df[
    ["region", "province_or_huc", "name"]
].itertuples(index=False, name=None):
    _leaf_set(root_dict.setdefault(region, {}), province_or_huc).add(barangay)

# Handling empty province: barangays hang off the municipality/city, which
# sits directly under the region.
for region, municipality_or_city, barangay in empty_province_df[
    ["region", "municipality_or_city", "name"]
].itertuples(index=False, name=None):
    _leaf_set(root_dict.setdefault(region, {}), municipality_or_city).add(barangay)

In [None]:
def _jsonable(value):
    """Fallback for ``json.dumps``: serialise sets as sorted lists.

    Set iteration order for strings varies between interpreter runs, so the
    members are sorted to keep the generated files identical across runs.

    Raises:
        TypeError: for any other unsupported type, as ``json.dumps`` expects
            from a ``default`` hook.
    """
    if isinstance(value, set):
        return sorted(value)
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


json_str = json.dumps(root_dict, default=_jsonable, indent=4)

# Round-trip through JSON so the YAML dump receives plain dicts and lists
# instead of sets.
json_dict = json.loads(json_str)
yaml_str = yaml.safe_dump(json_dict)


In [28]:
# Persist the JSON rendering of the hierarchy.
with open("../barangay/barangay.json", "w", encoding="utf8") as json_out:
    json_out.write(json_str)

In [29]:
# Persist the YAML rendering of the hierarchy.
with open("../barangay/barangay.yaml", "w", encoding="utf8") as yaml_out:
    yaml_out.write(yaml_str)