In [None]:
import json
from typing import Dict, Set

import pandas as pd
import yaml
from IPython.display import display

# Limit notebook previews to 50 rows and 50 columns.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 50)

# Location of the PSGC extract, relative to the notebook.
psg_directory = "../resources/"
psg_data_file = "psgc_2025-08-07.csv"

df = pd.read_csv(f"{psg_directory}{psg_data_file}")


def _strip_if_text(value):
    """Return ``value`` without surrounding whitespace when it is a string."""
    return value.strip() if isinstance(value, str) else value


# Render the IDs as text left-padded with zeros to 10 characters, then trim
# stray whitespace from every string cell in the frame.
df = df.assign(psgc_id=df["psgc_id"].astype(str).str.zfill(10)).map(_strip_if_text)

# Abbreviated level labels found in the source column -> spelled-out names.
geographic_level_map = dict(
    Reg="region",
    City="city",
    Mun="municipality",
    Prov="province",
    SubMun="submunicipality",
    Bgy="barangay",
)
# `replace` leaves any label that is not a key of the mapping untouched.
df["geographic_level"] = df["geographic_level"].replace(
    to_replace=geographic_level_map
)

# Break psgc_id (zero-padded to 10 characters when the file was loaded) into
# positional pieces. Every slice uses negative offsets, i.e. it is anchored to
# the end of the ID.
psgc_id_slices = {
    # "*_code": only the characters belonging to that single level.
    "barangay_code": slice(-3, None),
    "municipality_or_city_code": slice(-5, -3),
    "province_or_highly_urbanized_city_code": slice(-8, -5),
    "region_code": slice(-10, -8),
    # "*_mapper": the ID prefix running from the region characters down to
    # that level; used below as lookup keys for the parent names.
    "barangay_mapper": slice(-10, None),
    "municipality_or_city_mapper": slice(-10, -3),
    "province_or_highly_urbanized_city_mapper": slice(-10, -5),
    "region_mapper": slice(-10, -8),
}
for component_column, id_slice in psgc_id_slices.items():
    df[component_column] = df["psgc_id"].str[id_slice]

# Rows whose province, municipality/city and barangay parts are all zeros.
regions_filter = (
    df["province_or_highly_urbanized_city_code"].eq("000")
    & df["municipality_or_city_code"].eq("00")
    & df["barangay_code"].eq("000")
)
# region_mapper key -> name, with keys inserted in sorted order.
regions_mapper = (
    df.loc[regions_filter]
    .sort_values("region_mapper")
    .set_index("region_mapper")["name"]
    .to_dict()
)

# Rows with a non-zero province/HUC part but all-zero municipality/city and
# barangay parts.
province_or_huc_filter = (
    df["province_or_highly_urbanized_city_code"].ne("000")
    & df["municipality_or_city_code"].eq("00")
    & df["barangay_code"].eq("000")
)

# province_or_highly_urbanized_city_mapper key -> name, keys in sorted order.
province_or_huc_mapper = (
    df.loc[province_or_huc_filter]
    .sort_values("province_or_highly_urbanized_city_mapper")
    .set_index("province_or_highly_urbanized_city_mapper")["name"]
    .to_dict()
)

# Rows with non-zero province/HUC and municipality/city parts but an all-zero
# barangay part.
municipal_or_city_filter = (
    df["province_or_highly_urbanized_city_code"].ne("000")
    & df["municipality_or_city_code"].ne("00")
    & df["barangay_code"].eq("000")
)

# municipality_or_city_mapper key -> name, keys in sorted order.
municipal_or_city_mapper = (
    df.loc[municipal_or_city_filter]
    .sort_values("municipality_or_city_mapper")
    .set_index("municipality_or_city_mapper")["name"]
    .to_dict()
)

# (new name column, key column, key -> name lookup) for each parent level.
# Keys that are absent from a lookup come out as missing values.
name_lookups = (
    ("region", "region_mapper", regions_mapper),
    (
        "province_or_highly_urbanized_city",
        "province_or_highly_urbanized_city_mapper",
        province_or_huc_mapper,
    ),
    ("municipality_or_city", "municipality_or_city_mapper", municipal_or_city_mapper),
)
for name_column, key_column, lookup in name_lookups:
    df[name_column] = df[key_column].map(lookup)

# Keep only the barangay-level rows, renumbered from zero.
is_barangay = df["geographic_level"].eq("barangay")
barangay_df = df.loc[is_barangay].reset_index(drop=True)

# Forging SQLite dump

In [None]:
import numpy as np

In [None]:
# Legacy correspondence code: go through nullable Int64 first so the values
# render as plain digits, then convert to the nullable string dtype and
# left-pad with zeros to 9 characters.
# The padded text must replace the column outright: writing the strings back
# into the Int64 column with `.loc` cannot keep the leading zeros (pandas
# either coerces them back to integers or rejects the assignment).
# Missing codes stay missing (<NA>) throughout.
legacy_codes = barangay_df["correspondence_code"].astype("Int64")
not_empty = legacy_codes.notna()
barangay_df["correspondence_code"] = legacy_codes.astype("string").str.zfill(9)

# Population arrives as text: drop thousands separators and "-" characters
# (a lone "-" becomes an empty string, which parses as missing), then store
# as nullable integers.
population_text = (
    barangay_df["population"].str.replace(",", "").str.replace("-", "")
)
barangay_df["population"] = pd.to_numeric(population_text).astype("Int64")

# Spell out the settlement type flags.
barangay_df["settlement_type"] = (
    barangay_df["settlement_type"].replace("U", "urban").replace("R", "rural")
)

# Final column names used by the export below.
# NOTE: because of this rename the cell cannot be re-run on its own output;
# re-create barangay_df first.
barangay_df = barangay_df.rename(
    {
        "Unnamed: 9": "psgc_extras",
        "old_names": "legacy_name",
        "correspondence_code": "legacy_psgc_id",
    },
    axis=1,
)

In [None]:
# Peek at the first three barangay-level rows.
barangay_df.loc[barangay_df["geographic_level"].eq("barangay")].head(3)

In [None]:
# Frequency of each income_classification value among barangay-level rows,
# with missing values counted as their own bucket.
barangay_df.loc[
    barangay_df["geographic_level"].eq("barangay"), "income_classification"
].value_counts(dropna=False)

In [None]:
# Right-pad each parent key with zeros to 10 characters. Keys that are
# already 10 characters or longer are left as they are.
for mapper_column in (
    "region_mapper",
    "province_or_highly_urbanized_city_mapper",
    "municipality_or_city_mapper",
):
    barangay_df[mapper_column] = barangay_df[mapper_column].str.ljust(10, "0")


In [None]:
# Export column order, assembled from three groups.
descriptive_columns = [
    "psgc_id",
    "name",
    "geographic_level",
    "settlement_type",
    "population",
    "psgc_extras",
    "barangay_status",
]
# Per-level code followed by its lookup key, from barangay up to region.
hierarchy_columns = [
    "barangay_code",
    "barangay_mapper",
    "municipality_or_city_code",
    "municipality_or_city_mapper",
    "province_or_highly_urbanized_city_code",
    "province_or_highly_urbanized_city_mapper",
    "region_code",
    "region_mapper",
]
legacy_columns = [
    "legacy_psgc_id",
    "legacy_name",
]
col_ord = descriptive_columns + hierarchy_columns + legacy_columns

# Spot-check ten randomly chosen rows in export order (unseeded, so the rows
# differ between runs).
barangay_df.loc[:, col_ord].sample(n=10)

In [None]:
# Barangay frame restricted to, and ordered by, the export columns.
barangay_table = barangay_df.loc[:, col_ord]

In [None]:
# Print the column dtypes and non-null counts of the export-ordered frame.
barangay_table.info()

In [None]:
import sqlite3

# SQLite database file, resolved relative to the notebook's working directory.
database_name: str = "psgc.db"
# Connection shared by the export cells that follow.
conn: sqlite3.Connection = sqlite3.connect(database=database_name)

In [None]:
# Write the export columns to the "barangay" table, replacing any existing
# table of that name; the frame's index is not stored.
table_name = "barangay"
export_frame = barangay_df.loc[:, col_ord]
export_frame.to_sql(name=table_name, con=conn, if_exists="replace", index=False)

In [None]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()


# Template for an ABFS URI on Azure Data Lake Storage.
path_string = "abfss://{container}@{storage_account}.dfs.core.windows.net/{path}"


container = os.getenv("container")
storage_account = os.getenv("storage_account")
path = os.getenv("cloud_path")

# os.getenv returns None for unset variables, and str.format would silently
# render that as the text "None" inside the URI. Fail here, naming the
# missing variables, instead of producing an unusable path.
missing_settings = [
    variable
    for variable, value in (
        ("container", container),
        ("storage_account", storage_account),
        ("cloud_path", path),
    )
    if value is None
]
if missing_settings:
    raise RuntimeError(
        "Missing environment variable(s): " + ", ".join(missing_settings)
    )

azure_path = path_string.format(
    container=container, storage_account=storage_account, path=path
)

In [None]:
# Display the assembled ABFS URI for a visual check.
azure_path

In [None]:
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession

# Session settings that register Delta's SQL extension and catalog classes.
delta_session_conf = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

builder = SparkSession.builder.appName("DeltaWriter")
for conf_key, conf_value in delta_session_conf.items():
    builder = builder.config(conf_key, conf_value)

# Pass the builder through the delta helper, then start (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
# Smoke test for the Delta writer: save a two-row sample frame to the
# location assembled from the environment (`azure_path`) instead of a
# hard-coded template URI that cannot resolve to real storage.
# NOTE(review): `df` rebinds the name that held the pandas PSGC frame in the
# first cell; re-run that cell before reusing the pandas frame.
df = spark.createDataFrame([(1, "Alice"), (2, "Bob")], ["id", "name"])
df.write.format("delta").save(azure_path)