In [1]:
import os
from uuid import uuid4

import numpy as np
import pandas as pd
from clickhouse_connect import get_client
from dotenv import load_dotenv

pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 50)

psgc_directory = "../data/geography/"
psgc_data_file = "psgc_2025-08-07.csv"

df = pd.read_csv(psgc_directory + psgc_data_file)

df["psgc_id"] = df["psgc_id"].astype(str).str.zfill(10)
df = df.map(lambda x: x.strip() if isinstance(x, str) else x)

# renaming all geographic level to snake-case
geographic_level_map = {
    "Reg": "region",
    "City": "city",
    "Mun": "municipality",
    "Prov": "province",
    "SubMun": "submunicipality",
    "Bgy": "barangay",
}
df["geographic_level"] = df["geographic_level"].replace(geographic_level_map)

# Parsing the PSGC ID and identifying different area codes from it
# PSGC has 
df["barangay_code"] = df["psgc_id"].str[-3:]
df["municipality_or_city_code"] = df["psgc_id"].str[-5:-3]
df["province_or_highly_urbanized_city_code"] = df["psgc_id"].str[-8:-5]
df["region_code"] = df["psgc_id"].str[-10:-8]

# Obtaining mappable
df["barangay_mapper"] = df["psgc_id"].str[-10:]
df["municipality_or_city_mapper"] = df["psgc_id"].str[-10:-3]
df["province_or_highly_urbanized_city_mapper"] = df["psgc_id"].str[-10:-5]
df["region_mapper"] = df["psgc_id"].str[-10:-8]
regions_filter = (
    (df["province_or_highly_urbanized_city_code"] == "000")
    & (df["municipality_or_city_code"] == "00")
    & (df["barangay_code"] == "000")
)
regions_mapper = (
    df.loc[regions_filter, ["region_mapper", "name"]]
    .sort_values("region_mapper")
    .set_index("region_mapper", drop=True)
    .to_dict()["name"]
)
province_or_huc_filter = (
    ~(df["province_or_highly_urbanized_city_code"] == "000")
    & (df["municipality_or_city_code"] == "00")
    & (df["barangay_code"] == "000")
)
province_or_huc_mapper = (
    df.loc[province_or_huc_filter, ["province_or_highly_urbanized_city_mapper", "name"]]
    .sort_values("province_or_highly_urbanized_city_mapper")
    .set_index("province_or_highly_urbanized_city_mapper")
    .to_dict()["name"]
)
municipal_or_city_filter = (
    ~(df["province_or_highly_urbanized_city_code"] == "000")
    & ~(df["municipality_or_city_code"] == "00")
    & (df["barangay_code"] == "000")
)
municipal_or_city_mapper = (
    df.loc[municipal_or_city_filter, ["municipality_or_city_mapper", "name"]]
    .sort_values("municipality_or_city_mapper")
    .set_index("municipality_or_city_mapper")
    .to_dict()["name"]
)
df["region"] = df["region_mapper"].map(regions_mapper)
df["province_or_highly_urbanized_city"] = df[
    "province_or_highly_urbanized_city_mapper"
].map(province_or_huc_mapper)
df["municipality_or_city"] = df["municipality_or_city_mapper"].map(
    municipal_or_city_mapper
)

In [2]:
df

Unnamed: 0,psgc_id,name,correspondence_code,geographic_level,old_names,city_class,income_classification,settlement_type,population,Unnamed: 9,barangay_status,barangay_code,municipality_or_city_code,province_or_highly_urbanized_city_code,region_code,barangay_mapper,municipality_or_city_mapper,province_or_highly_urbanized_city_mapper,region_mapper,region,province_or_highly_urbanized_city,municipality_or_city
0,1300000000,National Capital Region (NCR),130000000.0,region,,,,,13484462,,,000,00,000,13,1300000000,1300000,13000,13,National Capital Region (NCR),,
1,1380100000,City of Caloocan,137501000.0,city,,HUC,1st,,1661584,,,000,00,801,13,1380100000,1380100,13801,13,National Capital Region (NCR),City of Caloocan,
2,1380100001,Barangay 1,137501001.0,barangay,,,,U,2319,,,001,00,801,13,1380100001,1380100,13801,13,National Capital Region (NCR),City of Caloocan,
3,1380100002,Barangay 2,137501002.0,barangay,,,,U,5156,,,002,00,801,13,1380100002,1380100,13801,13,National Capital Region (NCR),City of Caloocan,
4,1380100003,Barangay 3,137501003.0,barangay,,,,U,2497,,,003,00,801,13,1380100003,1380100,13801,13,National Capital Region (NCR),City of Caloocan,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43764,1999908006,Manaulanan,124712037.0,barangay,,,,U,7632,,,006,08,999,19,1999908006,1999908,19999,19,Bangsamoro Autonomous Region In Muslim Mindana...,Special Geographic Area,Tugunan
43765,1999908007,Pamalian,124712062.0,barangay,,,,R,3256,,,007,08,999,19,1999908007,1999908,19999,19,Bangsamoro Autonomous Region In Muslim Mindana...,Special Geographic Area,Tugunan
43766,1999908008,Tapodoc,124717017.0,barangay,,,,R,1767,,,008,08,999,19,1999908008,1999908,19999,19,Bangsamoro Autonomous Region In Muslim Mindana...,Special Geographic Area,Tugunan
43767,1999908009,Macabual,124712034.0,barangay,,,,R,4557,,,009,08,999,19,1999908009,1999908,19999,19,Bangsamoro Autonomous Region In Muslim Mindana...,Special Geographic Area,Tugunan


# Barangay dimension

In [3]:
barangay_df = df.reset_index(drop=True)
barangay_df["correspondence_code"] = barangay_df["correspondence_code"].astype("Int64")
not_empty = barangay_df["correspondence_code"].notna()
barangay_df.loc[not_empty, "correspondence_code"] = (
    barangay_df[not_empty]["correspondence_code"]
    .astype(str)
    .str.zfill(9)
    .replace("<NA>", "null")
)
barangay_df["correspondence_code"] = barangay_df["correspondence_code"].astype(str)
barangay_df["population"] = pd.to_numeric(
    barangay_df["population"].str.replace(",", "").str.replace("-", "")
).astype("Int64")
barangay_df["settlement_type"] = (
    barangay_df["settlement_type"].replace("U", "urban").replace("R", "rural")
)
barangay_df = barangay_df.rename(
    {
        "Unnamed: 9": "psgc_extras",
        "old_names": "legacy_psgc_name",
        "correspondence_code": "legacy_psgc_id",
        "name": "psgc_name",
    },
    axis=1,
)

barangay_df["region_mapper"] = barangay_df["region_mapper"].str.ljust(10, "0")
barangay_df["province_or_highly_urbanized_city_mapper"] = barangay_df[
    "province_or_highly_urbanized_city_mapper"
].str.ljust(10, "0")
barangay_df["municipality_or_city_mapper"] = barangay_df[
    "municipality_or_city_mapper"
].str.ljust(10, "0")

col_ord = [
    "psgc_id",
    "psgc_name",
    "geographic_level",
    "settlement_type",
    "income_classification",
    "city_class",
    "psgc_extras",
    "barangay_status",
    "barangay_code",
    "barangay_mapper",
    "municipality_or_city_code",
    "municipality_or_city_mapper",
    "province_or_highly_urbanized_city_code",
    "province_or_highly_urbanized_city_mapper",
    "region_code",
    "region_mapper",
    "legacy_psgc_id",
    "legacy_psgc_name",
]
barangay_table = barangay_df[col_ord]

for dtype, index in zip(barangay_table.dtypes, barangay_table.dtypes.index):
    if dtype == "object":
        print(index)
        barangay_table.loc[:, index] = barangay_table.loc[:, index].replace(np.nan, "")
    if dtype == "Int64":
        barangay_table.loc[:, index] = barangay_table.loc[:, index].replace(np.nan, 0)

barangay_table["surrogate_id"] = [uuid4() for _ in range(len(barangay_table))]
barangay_table["valid_from"] = pd.Timestamp.now(tz='UTC').tz_convert('Asia/Manila')

psgc_id
psgc_name
geographic_level
settlement_type
income_classification
city_class
psgc_extras
barangay_status
barangay_code
barangay_mapper
municipality_or_city_code
municipality_or_city_mapper
province_or_highly_urbanized_city_code
province_or_highly_urbanized_city_mapper
region_code
region_mapper
legacy_psgc_id
legacy_psgc_name


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  barangay_table["surrogate_id"] = [uuid4() for _ in range(len(barangay_table))]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  barangay_table["valid_from"] = pd.Timestamp.now(tz='UTC').tz_convert('Asia/Manila')


In [4]:
barangay_table

Unnamed: 0,psgc_id,psgc_name,geographic_level,settlement_type,income_classification,city_class,psgc_extras,barangay_status,barangay_code,barangay_mapper,municipality_or_city_code,municipality_or_city_mapper,province_or_highly_urbanized_city_code,province_or_highly_urbanized_city_mapper,region_code,region_mapper,legacy_psgc_id,legacy_psgc_name,surrogate_id,valid_from
0,1300000000,National Capital Region (NCR),region,,,,,,000,1300000000,00,1300000000,000,1300000000,13,1300000000,130000000,,15e447ad-8094-4a63-963d-691574cec5af,2025-08-28 23:31:17.432713+08:00
1,1380100000,City of Caloocan,city,,1st,HUC,,,000,1380100000,00,1380100000,801,1380100000,13,1300000000,137501000,,0c149f70-47f4-4e2c-b1e0-03e8255e0144,2025-08-28 23:31:17.432713+08:00
2,1380100001,Barangay 1,barangay,urban,,,,,001,1380100001,00,1380100000,801,1380100000,13,1300000000,137501001,,6c429bc3-900b-4fa7-9ad2-d1eef4b49b8f,2025-08-28 23:31:17.432713+08:00
3,1380100002,Barangay 2,barangay,urban,,,,,002,1380100002,00,1380100000,801,1380100000,13,1300000000,137501002,,0a8c2903-7a92-4d65-85d7-74eec2e3a7dd,2025-08-28 23:31:17.432713+08:00
4,1380100003,Barangay 3,barangay,urban,,,,,003,1380100003,00,1380100000,801,1380100000,13,1300000000,137501003,,cf03f41a-142b-4053-b0ca-af9b1ec169e0,2025-08-28 23:31:17.432713+08:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43764,1999908006,Manaulanan,barangay,urban,,,,,006,1999908006,08,1999908000,999,1999900000,19,1900000000,124712037,,5b0e1e72-f2b1-462f-8921-04e83aa68736,2025-08-28 23:31:17.432713+08:00
43765,1999908007,Pamalian,barangay,rural,,,,,007,1999908007,08,1999908000,999,1999900000,19,1900000000,124712062,,82122098-8eda-426f-a44d-3bb999c9f237,2025-08-28 23:31:17.432713+08:00
43766,1999908008,Tapodoc,barangay,rural,,,,,008,1999908008,08,1999908000,999,1999900000,19,1900000000,124717017,,699c71e4-6d2b-4b2f-ad9e-c9b55a05afbc,2025-08-28 23:31:17.432713+08:00
43767,1999908009,Macabual,barangay,rural,,,,,009,1999908009,08,1999908000,999,1999900000,19,1900000000,124712034,,6ecd0e4c-c5cf-4fa6-a398-4b6aaec82259,2025-08-28 23:31:17.432713+08:00


In [None]:
load_dotenv()

client_password: str = os.getenv("clickhouse_password") or ""
client_username: str = os.getenv("clickhouse_username") or ""

# Connect to ClickHouse
client = get_client(
    host="localhost", port=18123, username=client_username, password=client_password
)

In [None]:
# Insert DataFrame
response = client.insert_df("psgc.dim_administrative_area", barangay_table)

In [None]:
response.summary

In [None]:
barangay_df

In [None]:
poptable.info()

In [None]:
col_ord2 = [
    "psgc_id",
    "population",
]
poptable = barangay_df[col_ord2]

for dtype, index in zip(poptable.dtypes, poptable.dtypes.index):
    if dtype == "object":
        print(index)
        poptable.loc[:, index] = poptable.loc[:, index].replace(np.nan, "")
    if dtype == "Int64":
        poptable.loc[:, index] = poptable.loc[:, index].replace(np.nan, 0)
        poptable.loc[:, index] = poptable.loc[:, index].replace(None, "")


poptable["surrogate_id"] = [uuid4() for _ in range(len(poptable))]
poptable["valid_from"] = pd.Timestamp.now(tz='UTC').tz_convert('Asia/Manila')

In [None]:
poptable["population"].isna().value_counts()

In [None]:
poptable.info()

In [None]:
poptable

In [None]:
# Insert DataFrame
response = client.insert_df("psgc.fact_population_by_administrative_area", poptable)

In [None]:
response.summary

### I want to use the barangay table in fuzzy match notebook so I'm saving it as data

In [6]:
barangay_table["surrogate_id"] = barangay_table["surrogate_id"].astype(bytes)
barangay_table.to_parquet("../data/geography/administrative_regions.parquet")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  barangay_table["surrogate_id"] = barangay_table["surrogate_id"].astype(bytes)
