In [1]:
import json
from typing import Dict, Set

import pandas as pd
import yaml
from IPython.display import display
from rapidfuzz import fuzz

pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 50)

psg_directory = "../data/geography/"
psg_data_file = "psgc_2025-08-07.csv"

In [2]:
df = pd.read_csv(psg_directory + psg_data_file)
display(df.info())
display(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43769 entries, 0 to 43768
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   psgc_id                43769 non-null  int64  
 1   name                   43769 non-null  object 
 2   correspondence_code    43719 non-null  float64
 3   geographic_level       43767 non-null  object 
 4   old_names              1699 non-null   object 
 5   city_class             149 non-null    object 
 6   income_classification  1724 non-null   object 
 7   settlement_type        42011 non-null  object 
 8   population             43762 non-null  object 
 9   Unnamed: 9             75 non-null     object 
 10  barangay_status        2855 non-null   object 
dtypes: float64(1), int64(1), object(9)
memory usage: 3.7+ MB


None

Unnamed: 0,psgc_id,name,correspondence_code,geographic_level,old_names,city_class,income_classification,settlement_type,population,Unnamed: 9,barangay_status
0,1300000000,National Capital Region (NCR),130000000.0,Reg,,,,,13484462,,
1,1380100000,City of Caloocan,137501000.0,City,,HUC,1st,,1661584,,
2,1380100001,Barangay 1,137501001.0,Bgy,,,,U,2319,,
3,1380100002,Barangay 2,137501002.0,Bgy,,,,U,5156,,
4,1380100003,Barangay 3,137501003.0,Bgy,,,,U,2497,,
...,...,...,...,...,...,...,...,...,...,...,...
43764,1999908006,Manaulanan,124712037.0,Bgy,,,,U,7632,,
43765,1999908007,Pamalian,124712062.0,Bgy,,,,R,3256,,
43766,1999908008,Tapodoc,124717017.0,Bgy,,,,R,1767,,
43767,1999908009,Macabual,124712034.0,Bgy,,,,R,4557,,


In [3]:
# this code is just copied from my barangay project so there are more explanations there
# i think

df["psgc_id"] = df["psgc_id"].astype(str).str.zfill(10)
df = df.map(lambda x: x.strip() if isinstance(x, str) else x)

geographic_level_map = {
    "Reg": "region",
    "City": "city",
    "Mun": "municipality",
    "Prov": "province",
    "SubMun": "submunicipality",
    "Bgy": "barangay",
}
df["geographic_level"] = df["geographic_level"].replace(geographic_level_map)

df["barangay_code"] = df["psgc_id"].str[-3:]
df["municipal_or_city_code"] = df["psgc_id"].str[-5:-3]
df["province_or_huc_code"] = df["psgc_id"].str[-8:-5]
df["region_code"] = df["psgc_id"].str[-10:-8]

df["barangay_mapper"] = df["psgc_id"].str[-10:]
df["municipal_or_city_mapper"] = df["psgc_id"].str[-10:-3]
df["province_or_huc_mapper"] = df["psgc_id"].str[-10:-5]
df["region_mapper"] = df["psgc_id"].str[-10:-8]

df.sample(10)

regions_filter = (
    (df["province_or_huc_code"] == "000")
    & (df["municipal_or_city_code"] == "00")
    & (df["barangay_code"] == "000")
)
regions_mapper = (
    df.loc[regions_filter, ["region_mapper", "name"]]
    .sort_values("region_mapper")
    .set_index("region_mapper", drop=True)
    .to_dict()["name"]
)


province_or_huc_filter = (
    ~(df["province_or_huc_code"] == "000")
    & (df["municipal_or_city_code"] == "00")
    & (df["barangay_code"] == "000")
)

province_or_huc_mapper = (
    df.loc[province_or_huc_filter, ["province_or_huc_mapper", "name"]]
    .sort_values("province_or_huc_mapper")
    .set_index("province_or_huc_mapper")
    .to_dict()["name"]
)
municipal_or_city_filter = (
    ~(df["province_or_huc_code"] == "000")
    & ~(df["municipal_or_city_code"] == "00")
    & (df["barangay_code"] == "000")
)

municipal_or_city_mapper = (
    df.loc[municipal_or_city_filter, ["municipal_or_city_mapper", "name"]]
    .sort_values("municipal_or_city_mapper")
    .set_index("municipal_or_city_mapper")
    .to_dict()["name"]
)

df["region"] = df["region_mapper"].map(regions_mapper)
df["province_or_huc"] = df["province_or_huc_mapper"].map(province_or_huc_mapper)
df["municipality_or_city"] = df["municipal_or_city_mapper"].map(
    municipal_or_city_mapper
)
display(df.sample(10))

Unnamed: 0,psgc_id,name,correspondence_code,geographic_level,old_names,city_class,income_classification,settlement_type,population,Unnamed: 9,barangay_status,barangay_code,municipal_or_city_code,province_or_huc_code,region_code,barangay_mapper,municipal_or_city_mapper,province_or_huc_mapper,region_mapper,region,province_or_huc,municipality_or_city
26746,701223016,Lombog,71223016.0,barangay,,,,R,2149,,,16,23,12,7,701223016,701223,7012,7,Region VII (Central Visayas),Bohol,Guindulman
24665,631000021,Bolilao,63022021.0,barangay,,,,U,5332,,,21,0,310,6,631000021,631000,6310,6,Region VI (Western Visayas),City of Iloilo,
25975,1804622012,Lutay,74622012.0,barangay,,,,R,1207,,,12,22,46,18,1804622012,1804622,18046,18,Negros Island Region (NIR),Negros Oriental,Tayasan
42629,1906609011,Danapa,156609011.0,barangay,,,,R,1202,,,11,9,66,19,1906609011,1906609,19066,19,Bangsamoro Autonomous Region In Muslim Mindana...,Sulu,Parang
3636,102903017,Paing,12903017.0,barangay,,,,R,1625,,,17,3,29,1,102903017,102903,1029,1,Region I (Ilocos Region),Ilocos Sur,Bantay
33366,907207013,Saluyong,97207013.0,barangay,,,,R,1155,,,13,7,72,9,907207013,907207,9072,9,Region IX (Zamboanga Peninsula),Zamboanga del Norte,Manukan
29397,803702048,Veteranos,83702048.0,barangay,,,,R,620,,,48,2,37,8,803702048,803702,8037,8,Region VIII (Eastern Visayas),Leyte,Alangalang
33391,907208011,San Miguel,97208011.0,barangay,,,,R,628,,,11,8,72,9,907208011,907208,9072,9,Region IX (Zamboanga Peninsula),Zamboanga del Norte,Mutia
43356,1908712005,Kuden,153821006.0,barangay,,,,R,2082,,,5,12,87,19,1908712005,1908712,19087,19,Bangsamoro Autonomous Region In Muslim Mindana...,Maguindanao del Norte,Talitay
29157,802618020,Santa Margarita,82618020.0,barangay,,,,R,676,,,20,18,26,8,802618020,802618,8026,8,Region VIII (Eastern Visayas),Eastern Samar,Quinapondan


In [4]:
clean_mun = (
    df["municipality_or_city"]
    .astype(str)
    .str.lower()
    .str.replace(" ", "")
    .str.replace("(POB.)", "")
)
clean_name = (
    df["name"].astype(str).str.lower().str.replace(" ", "").str.replace("(POB.)", "")
)

df["candidate_hook"] = clean_mun + clean_name

In [5]:
df.sample(10)

Unnamed: 0,psgc_id,name,correspondence_code,geographic_level,old_names,city_class,income_classification,settlement_type,population,Unnamed: 9,barangay_status,barangay_code,municipal_or_city_code,province_or_huc_code,region_code,barangay_mapper,municipal_or_city_mapper,province_or_huc_mapper,region_mapper,region,province_or_huc,municipality_or_city,candidate_hook
39198,1206319009,Lamfugon,126319009.0,barangay,,,,U,6846,,,9,19,63,12,1206319009,1206319,12063,12,Region XII (SOCCSKSARGEN),South Cotabato,Lake Sebu,lakesebulamfugon
26137,1806105011,Solangon,76105011.0,barangay,,,,R,1419,,,11,5,61,18,1806105011,1806105,18061,18,Negros Island Region (NIR),Siquijor,San Juan,sanjuansolangon
5576,105524003,Alacan,15524003.0,barangay,,,,R,2013,,,3,24,55,1,105524003,105524,1055,1,Region I (Ilocos Region),Pangasinan,Malasiqui,malasiquialacan
38168,1108209025,Sasa,118209025.0,barangay,,,,R,619,,,25,9,82,11,1108209025,1108209,11082,11,Region XI (Davao Region),Davao de Oro,Nabunturan,nabunturansasa
1657,1381400016,Salapan,137405016.0,barangay,,,,U,8865,,,16,0,814,13,1381400016,1381400,13814,13,National Capital Region (NCR),City of San Juan,,nansalapan
21600,600413019,San Roque,60413019.0,barangay,,,,R,1485,,,19,13,4,6,600413019,600413,6004,6,Region VI (Western Visayas),Aklan,Malinao,malinaosanroque
36665,1004216007,Cahayag,104216007.0,barangay,,,,R,428,,,7,16,42,10,1004216007,1004216,10042,10,Region X (Northern Mindanao),Misamis Occidental,Tudela,tudelacahayag
17511,1705906011,Poblacion,175906011.0,barangay,,,,R,570,,,11,6,59,17,1705906011,1705906,17059,17,MIMAROPA Region,Romblon,Corcuera,corcuerapoblacion
37582,1102404017,Malabang Damsite,112404017.0,barangay,,,,R,1142,,,17,4,24,11,1102404017,1102404,11024,11,Region XI (Davao Region),Davao del Sur,Hagonoy,hagonoymalabangdamsite
34392,907325005,Daplayan,97325005.0,barangay,,,,R,988,,,5,25,73,9,907325005,907325,9073,9,Region IX (Zamboanga Peninsula),Zamboanga del Sur,San Pablo,sanpablodaplayan


In [6]:
from typing import List


def sanitize_input(input_str: str, exclude: List[str] | str | None = None) -> str:
    """
    Removes whitespaces, lowers, and remove all strings listed in exclude
    """
    sanitized_str = input_str.lower()
    if exclude is None:
        return sanitized_str

    if isinstance(exclude, list):
        exclude = [x.lower() for x in exclude if isinstance(x, str)]
        for item in exclude:
            sanitized_str.replace(item, "")
        return sanitized_str

    return sanitized_str.replace(exclude.lower(), "")

In [7]:
input_str = "BACARRALIBTONG"
sanitized_input = sanitize_input(input_str)

df["sanitized_candidate_hook"] = df["candidate_hook"].apply(
    sanitize_input, args=("(pob.)",)
)
df["score"] = (
    df["sanitized_candidate_hook"].apply(fuzz.ratio, args=(sanitized_input,)).round(1)
)

In [8]:
df["score"].value_counts().reset_index().sort_values(by="score", ascending=False)

Unnamed: 0,score,count
169,100.0,1
183,80.0,1
168,76.9,1
167,74.3,1
154,72.0,2
...,...,...
184,8.7,1
171,8.3,1
165,7.7,2
186,7.4,1


I used excel to parse the PDF... after long battle, didn't work...

Now let's use tabula-py (port of tabula from java)

In [9]:
# let's try tabula-py
import tabula

In [10]:
df = tabula.read_pdf("../data/education/masterlist.pdf", pages="all")

Failed to import jpype dependencies. Fallback to subprocess.
No module named 'jpype'


In [11]:
from tqdm import tqdm

root_df = pd.DataFrame()
for idx, d in enumerate(tqdm(df)):
    d["page"] = idx
    root_df = pd.concat([root_df, d])


100%|██████████| 544/544 [00:02<00:00, 182.36it/s] 


In [12]:
root_df = root_df.reset_index(drop=True)

In [13]:
educdf = root_df[root_df["BEIS School ID"].str.strip().str.isnumeric().notna()]

# Data Cleaning!

In [14]:
root_df.sample(10)

Unnamed: 0,Region,Division,District,BEIS School ID,School Name,Street Address,Municipality,Legislative District,Barangay,Sector,Urban/Ru,Sacl hColaosls Sifuicbactliaosnsification,Modified Curricural Offering Classification,page
14278,Region IV-A,Quezon,Catanauan,301311,Doongan Ilaya National High School,,CATANAUAN,3rd District,DOONGAN ILAYA,Public,Partially U,bDaenpED Managed,JHS with SHS,127
5304,Region II,Isabela,Tumauini Sou,h103891,Balug Elementary School,"BALUG, TUMAUINI, ISABELA",TUMAUINI,1st District,BALUG,Public,Partially U,bDaenpED Managed,Purely ES,47
60324,NCR,Las Piñas City,Las Piñas City I,408287,The Little Apprentice Preschool Inc.,"Ground Floor, EVIA North, Daang Hari",CITY OF LAS PIÑAS,Lone District,ALMANZA DOS,Private,Urban,Non-Sectarian,Purely ES,538
11136,Region III,Tarlac City,Tarlac Central,istrict489501,"Holy Triune God Learning School, Inc.",514 Blk 5,CITY OF TARLAC (Capital),2nd District,SAN NICOLAS,Private,Partially U,bNaonn-Sectarian,Purely ES,99
55057,BARMM,Sulu,Talipao,217063,Taraji Primary School,-,TALIPAO,1st District,LOWER KAMUNTAYAN,Public,Partially U,bDaenpED Managed,Purely ES,491
60006,NCR,Pasig City,Pasig City Dist,ict V485613,"Northridge Grade School and Therapy Center, Inc.",59 Kalinangan Street corner C. Raymundo Avenue,CITY OF PASIG,Lone District,CANIOGAN,Private,Urban,Non-Sectarian,Purely ES,535
45259,Region XI,Davao De Oro,Laak,128285,Bayanihan ES,"Purok 1, Bayanihan",LAAK (SAN VICENTE),2nd District,EL KATIPUNAN,Public,Partially U,bDaenpED Managed,Purely ES,404
12195,Region IV-A,Batangas,Mabini,107463,San Teodoro Elementary School,"San Teodoro, Mabini, Batangas",MABINI,2nd District,SAN TEODORO,Public,Partially U,bDaenpED Managed,Purely ES,108
28664,Region VI,Negros Occidental,Moises Padilla,302646,Guinpana-an NHS,PUROK 3,MOISES PADILLA (MAGALLON),5th District,GUINPANA-AN,Public,Partially U,bDaenpED Managed,JHS with SHS,255
60556,NCR,Malabon City,Malabon Distr,ct III487506,Malabon Educational Institution -Arellano Univ...,"Gov. Pascual Avenue, Malabon City",CITY OF MALABON,Lone District,BARITAN,Private,Urban,Non-Sectarian,Purely ES,540


In [15]:
for col in root_df.columns:
    print(col)

Region
Division
District
BEIS School ID
School Name
Street Address
Municipality
Legislative District
Barangay
Sector
Urban/Ru
Sacl hColaosls Sifuicbactliaosnsification
Modified Curricural Offering Classification
page


In [16]:
correct_column_names = {
    "Region": "region",
    "Division": "division",
    "District": "district",
    "BEIS School ID": "beis_school_id",
    "School Name": "school_name",
    "Street Address": "street_address",
    "Municipality": "municipality",
    "Legislative District": "legislative_district",
    "Barangay": "barangay",
    "Sector": "sector",
    "Urban/Ru": "settlement_type",
    "Sacl hColaosls Sifuicbactliaosnsification": "school_subclassification",
    "Modified Curricural Offering Classification": "modified_cultural_offering_classification",
    "page": "page",
}

In [17]:
root_df = root_df.rename(correct_column_names, axis=1)

# Fixing Categories
There are a few categorical data in here that got messed up during the parsing of the PDF File

In [18]:
root_df.sample(10)

Unnamed: 0,region,division,district,beis_school_id,school_name,street_address,municipality,legislative_district,barangay,sector,settlement_type,school_subclassification,modified_cultural_offering_classification,page
13499,Region IV-A,Laguna,Alaminos,402630,Saint Therese of the Child Jesus School (main),"#15 Topaz St., St. Francis Homes 3 San Pedro, ...",SAN PEDRO,1st District,SAN ANTONIO,Private,Urban,Sectarian,All Offering (K to 12),120
23243,Region V,Masbate,Baleno,302144,Magdalena National High School,Magdalena,BALENO,2nd District,MAGDALENA,Public,Partially U,bDaenpED Managed,JHS with SHS,207
56749,CAR,Benguet,Tuba,135685,Andolor ES,Andolor,TUBA,Lone District,TABAAN SUR,Public,Partially U,bDaenpED Managed,Purely ES,506
21357,Region V,Camarines Norte,Jose Panganib,n West112168,San Martin ES,-Barangay San Martin,JOSE PANGANIBAN,1st District,SAN MARTIN,Public,Partially U,bDaenpED Managed,Purely ES,190
23201,Region V,Masbate,Aroroy West,113406,Macabug ES,,AROROY,2nd District,MACABUG,Public,Partially U,bDaenpED Managed,Purely ES,207
49012,Region XII,Sarangani,South Malung,n130613,Malungon Central Elementary School SPED Center,Poblacion,MALUNGON,Lone District,POBLACION,Public,Partially U,bDaenpED Managed,Purely ES,437
6649,Region III,Bataan,Abucay,104537,P. Rubiano ES,P. Sacdalan,ABUCAY,1st District,MABATANG,Public,Partially U,bDaenpED Managed,Purely ES,59
39756,Region IX,Zamboanga del Norte,Salug I,124623,Salug CS,RAMON MAGSAYSAY,SALUG,3rd District,POBLACION EAST,Public,Partially U,bDaenpED Managed,Purely ES,355
37796,Region VIII,Northern Samar,Pambujan II,123174,Don Sixto Balanquit Elementary School,Purok I,PAMBUJAN,2nd District,"SIXTO T. BALANGUIT, SR.",Public,Rural,DepED Managed,Purely ES,337
16522,Region IV-A,San Pablo City,Sto. Angel,109803,Antonia Manuel Magcase Elementary School,Brgy. Sta. Isabel,SAN PABLO CITY,3rd District,SANTA ISABEL,Public,Urban,DepED Managed,Purely ES,147


In [19]:
root_df["sector"].value_counts(dropna=False)
# looks clean, bet lets convert to snake case

sector
Public       47421
Private      13256
SUCs/LUCs      247
Name: count, dtype: int64

In [20]:
root_df["sector"] = root_df["sector"].replace("Public", "public")
root_df["sector"] = root_df["sector"].replace("Private", "private")
root_df["sector"] = root_df["sector"].replace("SUCs/LUCs", "suc_luc")

In [21]:
root_df.sample(10)

Unnamed: 0,region,division,district,beis_school_id,school_name,street_address,municipality,legislative_district,barangay,sector,settlement_type,school_subclassification,modified_cultural_offering_classification,page
46460,Region XI,Davao Oriental,Gov. Generos,South129288,Aguinaldo Elementary School,PUROK 1,GOVERNOR GENEROSO,2nd District,SUROP,public,Partially U,bDaenpED Managed,Purely ES,414
37343,Region VIII,Northern Samar,San Roque,123245,SAN ROQUE CS,SAN ISIDRO ST.,SAN ROQUE,2nd District,ZONE 5 (POB.),public,Partially U,bDaenpED Managed,Purely ES,333
52788,CARAGA,Dinagat Island,Dinagat,324406,Primitivo J. Ebol Memorial National High School,"Magsaysay, Dinagat, Dinagat Islands",DINAGAT,Lone District,MAGSAYSAY,public,Partially U,bDaenpED Managed,JHS with SHS,471
9578,Region III,Tarlac,Capas West,160018,Manabayukan ES,Manabayukan,CAPAS,1st District,O'DONNELL,public,Partially U,bDaenpED Managed,Purely ES,85
24091,Region V,Sorsogon,Castilla West,114121,Canjela ES,"Canjela, Castilla, Sorsogon",CASTILLA,1st District,CANJELA,public,Partially U,bDaenpED Managed,Purely ES,215
40052,Region IX,Zamboanga del Norte,Tampilisan,124862,Tampilisan CS,"Pob. tampilisan, Z.N",TAMPILISAN,3rd District,POBLACION (TAMPILISAN),public,Partially U,bDaenpED Managed,Purely ES,357
59483,NCR,Caloocan City,Caloocan Nort,IV483565,"St. Teresa of Avila Academy, Inc.","Block 7 Lot 1 Phase IV, Tierra Nova, Bagumbong",KALOOKAN CITY,1st District,BARANGAY 171,private,Urban,Non-Sectarian,ES and JHS (K to 10),531
39788,Region IX,Zamboanga del Norte,Sergio Osmeñ,I124648,San Jose ES,"SAN JOSE,SERGIO OSMEÑA SR.",SERGIO OSMEÑA SR.,1st District,SAN JOSE,public,Partially U,bDaenpED Managed,Purely ES,355
51618,CARAGA,Butuan City,East Butuan D,strict II132035,Mahayahay ES,"-Purok-2 Mahayahay, Anticala, Butuan City",BUTUAN CITY (Capital),1st District,ANTICALA,public,Partially U,bDaenpED Managed,Purely ES,460
25894,Region VI,Antique,San Jose,438513,Advance Central College,Salazar Street,SAN JOSE (Capital),Lone District,BARANGAY 1 (POB.),private,Partially U,bNaonn-Sectarian,JHS with SHS,231


In [22]:
root_df["settlement_type"].value_counts(dropna=False)
# lets correct categories

settlement_type
Partially U    47606
Urban          10404
Rural           2914
Name: count, dtype: int64

In [23]:
root_df["settlement_type"] = root_df["settlement_type"].replace(
    "Partially U", "partially_urban"
)
root_df["settlement_type"] = root_df["settlement_type"].replace("Urban", "urban")
root_df["settlement_type"] = root_df["settlement_type"].replace("Rural", "rural")


In [24]:
root_df["settlement_type"].value_counts()

settlement_type
partially_urban    47606
urban              10404
rural               2914
Name: count, dtype: int64

In [25]:
root_df["school_subclassification"].value_counts(dropna=False)
# now, that's dirty

school_subclassification
bDaenpED Managed                 40457
DepED Managed                     6795
Non-Sectarian                     5244
bNaonn-Sectarian                  4252
bSeacntarian                      2595
Sectarian                         1163
bSUanC Managed                     160
bLoacnal Government                109
SUC Managed                         50
Local Government                    45
bLUanC                              21
LUC                                 16
bDaOnST Managed                      9
DOST Managed                         4
bLoacnal International School        2
bOatnher GA Managed                  1
Other GA Managed                     1
Name: count, dtype: int64

Here, I have to double check the real values in the PDF via reading it manually. Then I'll create the dictionary renamer later

In [26]:
root_df[root_df["school_subclassification"] == "bSUanC Managed"].sample(2)

Unnamed: 0,region,division,district,beis_school_id,school_name,street_address,municipality,legislative_district,barangay,sector,settlement_type,school_subclassification,modified_cultural_offering_classification,page
22727,Region V,Camarines Sur,Sipocot North,600056,Central Bicol State University of Agriculture ...,"Coloy-coloy, Impig, Sipocot",SIPOCOT,1st District,IMPIG,suc_luc,partially_urban,bSUanC Managed,JHS with SHS,202
994,Region I,La Union,Agoo East,600004,Don Mariano Marcos Memorial State University-S...,Consolacion,AGOO,2nd District,CONSOLACION (POB.),suc_luc,partially_urban,bSUanC Managed,All Offering (K to 12),8


In [27]:
root_df[root_df["school_subclassification"] == "bLoacnal Government"].sample(2)

Unnamed: 0,region,division,district,beis_school_id,school_name,street_address,municipality,legislative_district,barangay,sector,settlement_type,school_subclassification,modified_cultural_offering_classification,page
26491,Region VI,Capiz,Pilar,501100,Guise Integrated School,GUISE,PILAR,1st District,DULANGAN,public,partially_urban,bLoacnal Government,ES and JHS (K to 10),236
26522,Region VI,Capiz,President Rox,s115709,Bayuyan ES,"BAYUYAN, PRES. ROXAS",PRESIDENT ROXAS,1st District,BAYUYAN,public,partially_urban,bLoacnal Government,Purely ES,236


In [28]:
root_df[root_df["school_subclassification"] == "bDaOnST Managed"].sample(2)

Unnamed: 0,region,division,district,beis_school_id,school_name,street_address,municipality,legislative_district,barangay,sector,settlement_type,school_subclassification,modified_cultural_offering_classification,page
51566,CARAGA,Butuan City,Southeast I Bu,uan District305490,Philippine Science High School - Caraga Region...,"Tiniwisan, Butuan City",BUTUAN CITY (Capital),1st District,TINIWISAN,public,partially_urban,bDaOnST Managed,JHS with SHS,460
50573,Region XII,Koronadal City,Koronadal We,t District I330521,Philippine Science High School - SOCCSKSARGEN ...,Not Applicable,CITY OF KORONADAL (Capital),2nd District,PARAISO,public,partially_urban,bDaOnST Managed,JHS with SHS,451


In [29]:
root_df[root_df["school_subclassification"] == "bLoacnal International School"].sample(
    2
)

Unnamed: 0,region,division,district,beis_school_id,school_name,street_address,municipality,legislative_district,barangay,sector,settlement_type,school_subclassification,modified_cultural_offering_classification,page
21400,Region V,Camarines Norte,Labo East,409756,"ADR Bicol International Technological College,...",P-4,LABO,1st District,MALASUGUI,private,partially_urban,bLoacnal International School,Purely SHS,191
21401,Region V,Camarines Norte,Labo East,409757,"Camarines Norte International School, Inc.","Maharlika Highway, P-1",LABO,1st District,MASALONG,private,partially_urban,bLoacnal International School,Purely SHS,191


In [30]:
root_df[root_df["school_subclassification"] == "bOatnher GA Managed"].sample()

Unnamed: 0,region,division,district,beis_school_id,school_name,street_address,municipality,legislative_district,barangay,sector,settlement_type,school_subclassification,modified_cultural_offering_classification,page
6048,Region II,Tuguegarao City,Tuguegarao,est Educatio1n0a0l 9Z9o4,eDepartment of Agriculture R02 Child Developme...,"Nursery Compound, San Gabriel, Tuguegarao City",TUGUEGARAO CITY(Capital),3rd District,SAN GABRIEL,public,partially_urban,bOatnher GA Managed,Purely ES,54


In [31]:
mapper = {
    "bDaenpED Managed": "deped_managed",
    "DepED Managed": "deped_managed",
    "Non-Sectarian": "non_sectarian",
    "bNaonn-Sectarian": "non_sectarian",
    "bSeacntarian": "sectarian",
    "Sectarian": "sectarian",
    "bSUanC Managed": "suc_managed",
    "bLoacnal Government": "local_government",
    "SUC Managed": "suc_managed",
    "Local Government": "local_government",
    "bLUanC": "luc",
    "LUC": "luc",
    "bDaOnST Managed": "dost_managed",
    "DOST Managed": "dost_managed",
    "bLoacnal International School": "local_international_school",
    "bOatnher GA Managed": "other_ga_managed",
    "Other GA Managed": "other_ga_managed",
}

# nice list


In [32]:
# now let's replace

for key, value in mapper.items():
    root_df["school_subclassification"] = root_df["school_subclassification"].replace(
        key, value
    )

In [33]:
root_df.sample(10)

Unnamed: 0,region,division,district,beis_school_id,school_name,street_address,municipality,legislative_district,barangay,sector,settlement_type,school_subclassification,modified_cultural_offering_classification,page
35912,Region VIII,Leyte,Merida,121725,Lundag Elementary School,"Brgy. Lundag, Merida, Leyte",MERIDA,4th District,LUNDAG,public,partially_urban,deped_managed,Purely ES,320
38868,Region VIII,Tacloban City,District Learni,g Center I124242,Tagpuro Elementary School,Tagpuro,TACLOBAN CITY (Capital),1st District,BARANGAY 108 (TAGAPURO),public,urban,deped_managed,Purely ES,347
35945,Region VIII,Leyte,Palo I,404693,Alpha-Omega Learning Center New Life Baptist C...,0507 San Salvador St. St. Michael,PALO,1st District,SAN MIGUEL (POB.),private,partially_urban,non_sectarian,Purely ES,320
38599,Region VIII,Calbayog City,Calbayog Distr,ct III124047,San Jose Elementary School,Purok-2,CALBAYOG CITY,1st District,SAN JOSE,public,partially_urban,deped_managed,Purely ES,344
32857,Region VII,Cebu City,South District,312515,Quiot High School,Sitio Bogo Quiot,CEBU CITY (Capital),2nd District,QUIOT PARDO,public,urban,deped_managed,Purely JHS,293
23618,Region V,Masbate,Placer East,113821,Ban-ao ES,BAN-AO,PLACER,3rd District,BAN-AO,public,partially_urban,deped_managed,Purely ES,210
19759,Region IV-B,Palawan,Taytay II,111206,Alacalian Elementary School,purok 1,TAYTAY,1st District,ALACALIAN,public,partially_urban,deped_managed,Purely ES,176
32262,Region VII,Cebu,Moalboal,119498,Moalboal Central ES,-Pob.West,MOALBOAL,2nd District,POBLACION WEST,public,partially_urban,deped_managed,Purely ES,288
14956,Region IV-A,Quezon,San Francisco,501752,Madagoldol Integrated School,none,SAN FRANCISCO (AURORA),3rd District,INABUAN,public,partially_urban,deped_managed,ES and JHS (K to 10),133
49445,Region XII,South Cotabato,Polomolok W,st468576,"Lapid Kinderland, Inc.",Purok Pag-asa,POLOMOLOK,1st District,MAGSAYSAY,private,partially_urban,non_sectarian,Purely ES,441


In [34]:
root_df["modified_cultural_offering_classification"].value_counts()

modified_cultural_offering_classification
Purely ES                 43765
JHS with SHS               7490
All Offering (K to 12)     3423
ES and JHS (K to 10)       3056
Purely JHS                 1787
Purely SHS                 1403
Name: count, dtype: int64

In [35]:
offering_mapper = {
    "Purely ES": "purely_es",
    "JHS with SHS": "jhs_with_shs",
    "All Offering (K to 12)": "all_offering",
    "ES and JHS (K to 10)": "es_and_jhs",
    "Purely JHS": "purely_jhs",
    "Purely SHS": "purely_shs",
}

In [36]:
root_df["modified_cultural_offering_classification"] = root_df["modified_cultural_offering_classification"].replace(offering_mapper)

In [37]:
root_df.sample(10)

Unnamed: 0,region,division,district,beis_school_id,school_name,street_address,municipality,legislative_district,barangay,sector,settlement_type,school_subclassification,modified_cultural_offering_classification,page
25939,Region VI,Antique,San Remegio I,115262,Insubuan ES,"-san remigio, antique",SAN REMIGIO,Lone District,INSUBUAN,public,partially_urban,deped_managed,purely_es,231
4873,Region II,Isabela,Palanan,103544,Diddadungan Elementary School,"-Diddadungan,Palanan,Isabela",PALANAN,1st District,DIDDADUNGAN,public,partially_urban,deped_managed,purely_es,43
15976,Region IV-A,Batangas City,Batangas City,ast District 301483,Talumpok Integrated School,Talumpok,BATANGAS CITY (Capital),2nd District,TALUMPOK KANLURAN,public,partially_urban,deped_managed,jhs_with_shs,142
30480,Region VII,Bohol,Catigbian,118181,Rizal PS,Rizal,CATIGBIAN,1st District,RIZAL,public,partially_urban,deped_managed,purely_es,272
36009,Region VIII,Leyte,Palompon So,th121790,Canipaan Elementary School,,PALOMPON,4th District,CANIPAAN,public,partially_urban,deped_managed,purely_es,321
15546,Region IV-A,Rizal,San Mateo,308139,San Mateo National High School - Guinayang Annex,Jurado Comp. Brgy. Guinayang,SAN MATEO,2nd District,GUINAYANG,public,urban,deped_managed,purely_jhs,138
48969,Region XII,Sarangani,East Maitum,130531,Kipalkuda ES,"New La Union, Maitum, Sarangani",MAITUM,Lone District,NEW LA UNION,public,partially_urban,deped_managed,purely_es,437
4098,Region II,Cagayan,Sta. Praxedes,300489,Sta. Praxedes High School,Guerrero St.,SANTA PRAXEDES,2nd District,CENTRO I (POB.),public,partially_urban,deped_managed,jhs_with_shs,36
53,Region I,Ilocos Norte,Badoc,100040,Sta. Cruz ES,"Sta Cruz Norte, Badoc, Ilocos Norte",BADOC,2nd District,SANTA CRUZ SUR,public,partially_urban,deped_managed,purely_es,0
60539,NCR,Malabon City,Malabon Distr,ct II487531,"Academia De La Lilia, Inc.","Block 48 Lot 6 Lapu-Lapu Avenue, corner Hito S...",CITY OF MALABON,Lone District,LONGOS,private,urban,non_sectarian,purely_es,540


In [38]:
list_of_clean_dfs: List[pd.DataFrame] = []

In [39]:
root_df["beis_1"] = pd.to_numeric(root_df["beis_school_id"], errors="coerce").astype("Int32")

In [40]:
six_dig_and_not_null = (root_df["beis_1"].astype(str).str.len()==6) & (root_df["beis_1"].notna())
root_df[six_dig_and_not_null]

Unnamed: 0,region,division,district,beis_school_id,school_name,street_address,municipality,legislative_district,barangay,sector,settlement_type,school_subclassification,modified_cultural_offering_classification,page,beis_1
0,Region I,Ilocos Norte,Bacarra I,100001,Apaleng-Libtong ES,"Brgy. 21, Libtong, Bacarra, Ilocos Norte",BACARRA,1st District,LIBTONG,public,partially_urban,deped_managed,purely_es,0,100001
1,Region I,Ilocos Norte,Bacarra I,100002,Bacarra CES,Santa Rita,BACARRA,1st District,SANTA RITA (POB.),public,partially_urban,deped_managed,purely_es,0,100002
2,Region I,Ilocos Norte,Bacarra I,100003,Buyon ES,NONE,BACARRA,1st District,BUYON,public,partially_urban,deped_managed,purely_es,0,100003
3,Region I,Ilocos Norte,Bacarra I,100004,Ganagan Elementary School,"#37 Ganagan,Bacarra, Ilocos Norte",BACARRA,1st District,GANAGAN,public,partially_urban,deped_managed,purely_es,0,100004
4,Region I,Ilocos Norte,Bacarra I,100005,Macupit ES,Macupit,BACARRA,1st District,MACUPIT,public,partially_urban,deped_managed,purely_es,0,100005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60804,NCR,Taguig,Pateros,488005,SEP Christian School,431 A F. Imson St. San Pedro Pateros Metro Manila,PATEROS,1st District,SAN PEDRO,private,urban,non_sectarian,purely_es,542,488005
60805,NCR,Taguig,Pateros,488043,"Huckleberry Montessori School, Inc. Pateros (M...",A. Almeda St.,PATEROS,1st District,MAGTANGGOL,private,urban,non_sectarian,purely_es,542,488043
60806,NCR,Taguig,Pateros,488112,Maranatha Christian Academy of Tabacalera Pate...,101 F.C. Tuazon Street,PATEROS,1st District,TABACALERA,private,urban,non_sectarian,purely_es,542,488112
60807,NCR,Taguig,Pateros,488114,"ABC Educational Development Center, Inc.",29-A Almeda Street,PATEROS,1st District,MARTIRES DEL 96,private,urban,non_sectarian,purely_es,542,488114


In [41]:
list_of_clean_dfs.append(root_df[six_dig_and_not_null])

In [42]:
wdf = root_df[~six_dig_and_not_null]

In [43]:
wdf.sample(10)

Unnamed: 0,region,division,district,beis_school_id,school_name,street_address,municipality,legislative_district,barangay,sector,settlement_type,school_subclassification,modified_cultural_offering_classification,page,beis_1
38728,Region VIII,Calbayog City,Tinambacan D,strict III501970,Caglanipao Sur Integrated School,Purok-2,CALBAYOG CITY,1st District,CAGLANIPAO SUR,public,partially_urban,deped_managed,es_and_jhs,345,
44524,Region X,Cagayan de Oro City,Cagayan de O,o City West 4II0 D5i2st3r5i,tMerry Child School,"Zone 7, Bulua, Cagayan de Oro City",CAGAYAN DE ORO CITY (Capital),1st District,BULUA,private,urban,non_sectarian,all_offering,397,
44521,Region X,Cagayan de Oro City,Cagayan de O,o City West 4II0 D5i2st0r9i,tDiamond Evangelical School Inc.,"Camp Evangelista Patag, Cagayan de Oro City",CAGAYAN DE ORO CITY (Capital),1st District,PATAG,private,urban,sectarian,purely_es,397,
50215,Region XII,Cotabato City,Cotabato City,istrict III304634,Notre Dame Village National High School,"San Herminigildo St. RH 8, Cotabato City",COTABATO CITY,1st District,ROSARY HEIGHTS VIII,public,urban,deped_managed,jhs_with_shs,448,
10895,Region III,San Jose del Monte City,San Jose Del,onte West401470,Spirit of Joy School,Main Rd. Cor. San Lorenzo Ruiz St. Pleasant Hi...,CITY OF SAN JOSE DEL MONTE,Lone District,SAN MANUEL,private,urban,non_sectarian,es_and_jhs,97,
47317,Region XI,Digos City,Digos Occiden,al316303,Balabag National High School,Balabag,CITY OF DIGOS (Capital),1st District,BALABAG,public,partially_urban,deped_managed,purely_jhs,422,
44667,Region X,Gingoog City,Gingoog City S,uth-2 Distr3ic0t4131,Gingoog City CNHS - BACKKISMI NHS Annex,Purok #1 Binakalan,GINGOOG CITY,1st District,BINAKALAN,public,partially_urban,deped_managed,jhs_with_shs,398,
36523,Region VIII,Southern Leyte,San Juan (Cab,lian)122249,Dayanog Elementary School,-Municipal road,SAN JUAN (CABALIAN),Lone District,DAYANOG,public,partially_urban,deped_managed,purely_es,326,
3174,Region I,Urdaneta City,Urdaneta City,istrict II500007,Catablan Integrated School,"Zone 3, Catablan, Urdaneta City, Pangasinan",CITY OF URDANETA,5th District,CATABLAN,public,urban,deped_managed,all_offering,28,
4479,Region II,Isabela,Angadanan W,st103068,Sinabbaran Elementary School,-PUROK 3,ANGADANAN,3rd District,SINABBARAN,public,partially_urban,deped_managed,purely_es,40,


In [44]:
#trial
wseries = wdf.loc[17099]

In [45]:
import re
re.findall(r"\d+", wseries["beis_school_id"])

['400804']

In [46]:
# now lets try that
wdf["beis_1"] = wdf["beis_school_id"].str.findall(r"\d+").str[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wdf["beis_1"] = wdf["beis_school_id"].str.findall(r"\d+").str[0]


In [47]:
wdf.sample(10)

Unnamed: 0,region,division,district,beis_school_id,school_name,street_address,municipality,legislative_district,barangay,sector,settlement_type,school_subclassification,modified_cultural_offering_classification,page,beis_1
59804,NCR,Makati City,Makati City Di,trict V406857,4th Watch Maranatha Christian Academy of Makati,2121 Nuestra Señora St.,CITY OF MAKATI,2nd District,GUADALUPE NUEVO,private,urban,non_sectarian,all_offering,534,406857
54712,BARMM,Maguindanao I,Datu Abdullah,Sangki133874,Campo Cuatro ES,-Campo Cuatro,DATU ABDULLAH SANGKI,2nd District,TALISAWA,public,rural,deped_managed,purely_es,488,133874
16399,Region IV-A,Lucena City,Lucena West,istrict427511,"Growwe Learning Center, Inc.",1170 Garnet Street Iyam Lucena City,LUCENA CITY (Capital),2nd District,ILAYANG IYAM,private,urban,non_sectarian,purely_es,146,427511
40688,Region IX,Zamboanga del Sur,Ramon Magsa,say125322,Magsaysay ES,Purok 2,RAMON MAGSAYSAY (LIARGO),1st District,MAGSAYSAY,public,partially_urban,deped_managed,purely_es,363,125322
17041,Region IV-A,Tanauan City,Tanauan City,orth II321601,Tanauan City Integrated High School,"Trapiche, Tanauan City, Batangas",CITY OF TANAUAN,3rd District,TRAPICHE,public,urban,deped_managed,jhs_with_shs,152,321601
42158,Region X,Malaybalay City,Malaybalay Ci,y District VI 126564,Laguitas ES,"Purok 2, Laguitas, Malaybalay City",CITY MALAYBALAY (Capital),2nd District,LAGUITAS,public,partially_urban,deped_managed,purely_es,376,126564
60524,NCR,Malabon City,Malabon Distr,ct II320501,Longos National High School,"Maya-Maya St., Longos, Malabon City",CITY OF MALABON,Lone District,LONGOS,public,urban,deped_managed,purely_jhs,540,320501
57886,NCR,City of San Juan,San Juan Distr,ct I485503,Fountain International School,"14 Annapolis St., Greenhills",CITY OF SAN JUAN,Lone District,GREENHILLS,private,urban,non_sectarian,purely_es,516,485503
58341,NCR,Quezon City,School District,I406392,Sacred Heart Academy of La Loma,"49 N.S. Amoranto Sr. Ave., La Loma Quezon, City",QUEZON CITY,1st District,PAANG BUNDOK,private,urban,non_sectarian,all_offering,520,406392
18239,Region IV-B,Occidental Mindoro,Abra De Ilog-P,luan305854,Pambuhan Indigenous People Village High School,,ABRA DE ILOG,Lone District,SAN VICENTE,public,partially_urban,deped_managed,purely_jhs,162,305854


In [48]:
is_six_digit = (pd.to_numeric(wdf["beis_1"], errors="coerce").astype("Int32").astype(str).str.len()==6)

In [49]:
wdf[is_six_digit]

Unnamed: 0,region,division,district,beis_school_id,school_name,street_address,municipality,legislative_district,barangay,sector,settlement_type,school_subclassification,modified_cultural_offering_classification,page,beis_1
87,Region I,Ilocos Norte,Banna (Espirit,)100058,Bangsar ES,Bangsar,BANNA (ESPIRITU),2nd District,BANGSAR,public,partially_urban,deped_managed,purely_es,0,100058
88,Region I,Ilocos Norte,Banna (Espirit,)100059,Banna Central Elementary School,"P. Gomez, Marcos, Banna, Ilocos Norte",BANNA (ESPIRITU),2nd District,MARCOS (POB.),public,partially_urban,deped_managed,purely_es,0,100059
89,Region I,Ilocos Norte,Banna (Espirit,)100060,Barbarangay ES,Barbarangay,BANNA (ESPIRITU),2nd District,BARBARANGAY,public,partially_urban,deped_managed,purely_es,0,100060
90,Region I,Ilocos Norte,Banna (Espirit,)100061,Bomitog ES,Banna - Pinili Rd.,BANNA (ESPIRITU),2nd District,BOMITOG,public,partially_urban,deped_managed,purely_es,0,100061
91,Region I,Ilocos Norte,Banna (Espirit,)100062,Bugasi ES,Bugasi,BANNA (ESPIRITU),2nd District,BUGASI,public,partially_urban,deped_managed,purely_es,0,100062
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60919,NCR,Muntinlupa City,Muntinlupa Ci,y District II488533,"CBC Integrated School, Inc.","1560 Estanislao Street, Lakeview Homes I",CITY OF MUNTINLUPA,Lone District,PUTATAN,private,urban,non_sectarian,purely_es,543,488533
60920,NCR,Muntinlupa City,Muntinlupa Ci,y District II488544,"The Linden Tree Institute, Inc.",177 Buencamino Street,CITY OF MUNTINLUPA,Lone District,ALABANG,private,urban,non_sectarian,es_and_jhs,543,488544
60921,NCR,Muntinlupa City,Muntinlupa Ci,y District II488547,Cambridge Children's Learning and Development ...,Lower Ground Level Alabang Town Center,CITY OF MUNTINLUPA,Lone District,ALABANG,private,urban,sectarian,purely_es,543,488547
60922,NCR,Muntinlupa City,Muntinlupa Ci,y District II488548,Holy Word Christian School,"4 Cattleya St., Doña Rosario Bayview Subdivision",CITY OF MUNTINLUPA,Lone District,SUCAT,private,urban,sectarian,es_and_jhs,543,488548


In [50]:
list_of_clean_dfs.append(wdf[is_six_digit])

In [51]:
wwdf = wdf[~is_six_digit]

In [52]:
# wwdf["beis_1"] = 
wwdf["beis_1"] = pd.to_numeric(wwdf["beis_school_id"].str.findall(r"\d+").str.join(""), errors="coerce").astype("Int32").astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wwdf["beis_1"] = pd.to_numeric(wwdf["beis_school_id"].str.findall(r"\d+").str.join(""), errors="coerce").astype("Int32").astype(str)


In [53]:
nuther_six_dig = (wwdf["beis_1"].str.len()==6)

In [54]:
list_of_clean_dfs.append(wwdf[nuther_six_dig])

In [55]:
# Now's the hard part

In [56]:
rdf = wwdf[~nuther_six_dig]

In [57]:
sev_dig = (rdf["beis_1"].str.len()==7)

In [58]:
sev_df = rdf[sev_dig]

In [59]:
sev_df["beis_1"] = sev_df["beis_1"].str[1:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sev_df["beis_1"] = sev_df["beis_1"].str[1:]


In [60]:
list_of_clean_dfs.append(sev_df)

In [61]:
wherf = rdf[~sev_dig]

In [62]:
wherf["beis_1"] = wherf["beis_school_id"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wherf["beis_1"] = wherf["beis_school_id"]


In [63]:
wherf["beis_1"] = wherf["beis_1"].astype(str).str[1:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wherf["beis_1"] = wherf["beis_1"].astype(str).str[1:]


In [64]:
list_of_clean_dfs.append(wherf)

In [65]:
raise KeyboardInterrupt

KeyboardInterrupt: 

In [68]:
mother_df: pd.DataFrame = pd.DataFrame()
for a_df  in list_of_clean_dfs:
    mother_df = pd.concat([mother_df, a_df])

In [70]:
mother_df["beis_school_id"] = mother_df["beis_1"]

In [75]:
mother_df["beis_school_id"].astype(str).str.len().value_counts()

beis_school_id
6    60924
Name: count, dtype: int64

In [78]:
mother_df["page"] = mother_df["page"] + 1

In [80]:
mother_df = mother_df.rename({"page":"masterlist_page"}, axis=1)

In [84]:
mother_df = mother_df.drop("beis_1", axis=1)

In [85]:
mother_df

Unnamed: 0,region,division,district,beis_school_id,school_name,street_address,municipality,legislative_district,barangay,sector,settlement_type,school_subclassification,modified_cultural_offering_classification,masterlist_page
0,Region I,Ilocos Norte,Bacarra I,100001,Apaleng-Libtong ES,"Brgy. 21, Libtong, Bacarra, Ilocos Norte",BACARRA,1st District,LIBTONG,public,partially_urban,deped_managed,purely_es,1
1,Region I,Ilocos Norte,Bacarra I,100002,Bacarra CES,Santa Rita,BACARRA,1st District,SANTA RITA (POB.),public,partially_urban,deped_managed,purely_es,1
2,Region I,Ilocos Norte,Bacarra I,100003,Buyon ES,NONE,BACARRA,1st District,BUYON,public,partially_urban,deped_managed,purely_es,1
3,Region I,Ilocos Norte,Bacarra I,100004,Ganagan Elementary School,"#37 Ganagan,Bacarra, Ilocos Norte",BACARRA,1st District,GANAGAN,public,partially_urban,deped_managed,purely_es,1
4,Region I,Ilocos Norte,Bacarra I,100005,Macupit ES,Macupit,BACARRA,1st District,MACUPIT,public,partially_urban,deped_managed,purely_es,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32715,Region VII,Cebu,Pinamungajan,119572,Tajao Central School,South,PINAMUNGAHAN,3rd District,TAJAO,public,partially_urban,deped_managed,purely_es,293
32716,Region VII,Cebu,Pinamungajan,119574,Tanibag ES,"Tanibag, Pinamungajan, Cebu",PINAMUNGAHAN,3rd District,TANIBAG,public,partially_urban,deped_managed,purely_es,293
32717,Region VII,Cebu,Pinamungajan,187011,Buhingtubig ES,"Buhingtubig, Pinamungajan",PINAMUNGAHAN,3rd District,BUHINGTUBIG,public,partially_urban,deped_managed,purely_es,293
32718,Region VII,Cebu,Pinamungajan,187012,Cabiangon ES,"Cabiangon, Pinamungajan",PINAMUNGAHAN,3rd District,CABIANGON,public,partially_urban,deped_managed,purely_es,293


In [87]:
mother_df["beis_school_id"] = mother_df["beis_school_id"].astype(str)

In [91]:
mother_df.to_parquet("../data/education/basic_education_institutions.parquet", index=False)

In [90]:
mother_df.to_csv("../data/education/basic_education_institutions.csv", index=False)

In [None]:
isnumeric_filter.value_counts(dropna=False)

AttributeError: 'function' object has no attribute 'value_counts'

In [None]:
import numpy as np

In [None]:
root_df["beis_school_id"].sample(10)

63                     123186
64                     109901
107                    104802
25                     300313
79                alona117028
13     Central Dis3tr2ic0t404
72                     106473
33                     135139
23                     342346
95               ict II406890
Name: beis_school_id, dtype: object

In [None]:
root_df["numeric_beis"] = isnumeric_filter

In [None]:
root_df[root_df["numeric_beis"]]

ValueError: Cannot mask with non-boolean array containing NA / NaN values