### IMPORT PACKAGE

In [114]:
# import packages pandas and numpy
import pandas as pd
import numpy as np
import re

### EXTRACT DATA

In [115]:
# function to extract excel file as dataframe
def extract(file_path, sheet, rows, ind_col):
    
    # read the file into memory
    data = pd.read_excel(file_path, sheet_name = sheet, nrows = rows, index_col = ind_col)

    # excel to df
    data = pd.DataFrame(data)
    
    # printing details about the file
    print(f"data store in [{file_path}]:")
    print(f"\nNumber of rows [{data.shape[0]}], and Number of columns [{data.shape[1]}] in dataframe")
    print(f"\nColumns in dataframe with it's data types: ")
    
    # print data types
    print(data.dtypes)
    
    print(f"\n Printing the count value of NULL per column\n")
    print(data.isna().sum())
    
    # print message before returning the dataframe
    print(f"\nTo view the dataframe extracted from {file_path}, display the value returned by this function!\n\n") 
    
    return data

In [116]:
# call the function
psgc = extract("datasets\\PSGC-4Q-2023-Publication-Datafile.xlsx",
               sheet = 3,
               rows = None,
               ind_col = None )

data store in [datasets\PSGC-4Q-2023-Publication-Datafile.xlsx]:

Number of rows [43762], and Number of columns [13] in dataframe

Columns in dataframe with it's data types: 
10-digit PSGC                         float64
Name                                   object
Correspondence Code                   float64
Geographic Level                       object
Old names                              object
City Class                             object
Income\nClassification                 object
Urban / Rural\n(based on 2020 CPH)     object
2015 Population                        object
Unnamed: 9                             object
2020 Population                        object
Unnamed: 11                            object
Status                                 object
dtype: object

 Printing the count value of NULL per column

10-digit PSGC                             4
Name                                      0
Correspondence Code                      38
Geographic Level                  

### TRANSFORMING DATA

In [117]:
# renaming and cleaning the columns/headers 
def transform(data):
    
    # changing columns/headers string to lowercase, replacing special characters to spaces
    clean_columns = data.columns.str.lower().str.replace(r'[^a-zA-Z0-9\s]', '')
    
    # passing function as clean_columns to dataframe columns
    data.columns = clean_columns
    
    # return to function
    return data

In [118]:
# Transformation
psgc = transform(psgc)

psgc

Unnamed: 0,10-digit psgc,name,correspondence code,geographic level,old names,city class,income\nclassification,urban / rural\n(based on 2020 cph),2015 population,unnamed: 9,2020 population,unnamed: 11,status
0,1.000000e+08,Region I (Ilocos Region),10000000.0,Reg,,,,,5026128,,5301139,,
1,1.028000e+08,Ilocos Norte,12800000.0,Prov,,,1st,,593081,,609588,,
2,1.028010e+08,Adams,12801000.0,Mun,,,5th,,1792,,2189,,
3,1.028010e+08,Adams,12801001.0,Bgy,,,,R,1792,,2189,,Pob.
4,1.028020e+08,Bacarra,12802000.0,Mun,,,3rd,,32215,,33496,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
43757,1.999908e+09,Lagunde,124712030.0,Bgy,,,,U,3701,,5332,,
43758,1.999908e+09,Macasendeg,124712035.0,Bgy,,,,R,3775,,2516,,
43759,1.999908e+09,Manaulanan,124712037.0,Bgy,,,,U,7477,,7632,,
43760,1.999908e+09,Pamalian,124712062.0,Bgy,,,,R,2982,,3256,,


In [119]:
def filter_cols(data, column_name, level):
    
    if column_name not in data.columns:
        print(f"Column '{column_name}' not found in DataFrame.")
        return None
    
    data = data[data[column_name] == level][['10digitpsgc', 'name', 'correspondencecode', 'geographiclevel',
       'oldnames', 'cityclass', 'incomeclassification',
       'urbanruralbasedon2020cph', '2015population',
       '2020population','status']]
    
    return data

In [120]:
columns = psgc.columns

pd.DataFrame(columns)


Unnamed: 0,0
0,10-digit psgc
1,name
2,correspondence code
3,geographic level
4,old names
5,city class
6,income\nclassification
7,urban / rural\n(based on 2020 cph)
8,2015 population
9,unnamed: 9


In [121]:
# Records per Regional
regional = filter_cols(psgc, 'geographiclevel', "Reg")

# Regional Output
regional

Column 'geographiclevel' not found in DataFrame.


In [122]:
# Records per Regional
provincial = filter_cols(psgc, 'geographiclevel', "Prov")

# Provincial Output
provincial

Column 'geographiclevel' not found in DataFrame.


In [123]:
# Records per Municipal
municipal = filter_cols(psgc, 'geographiclevel', "Mun")

# Municipal Output
municipal

Column 'geographiclevel' not found in DataFrame.


In [124]:
# Records per City
city = filter_cols(psgc, 'geographiclevel', "City")

# City Output
city

Column 'geographiclevel' not found in DataFrame.


In [125]:
# Records per Baranggay
baranggay = filter_cols(psgc, 'geographiclevel', "Bgy")

# Count Records of Baranggay
print(f"Baranggay counts are: {baranggay.shape[0]}\n")

# Count based on urbanruralbasedon2020cph
brgy_ur_count = baranggay.groupby(['urbanruralbasedon2020cph'])['urbanruralbasedon2020cph'].count()
print("Count of Urban/ Rural/ Null:")
print(f"{brgy_ur_count.to_string(header=False)}\n")


# Baranggay Output
baranggay

Column 'geographiclevel' not found in DataFrame.


AttributeError: 'NoneType' object has no attribute 'shape'

In [None]:
# Records per NCR Districts
ncr_districts = filter_cols(psgc, 'geographiclevel', "Dist")

# Count of Districts NCR
print(f"NCR Disctricts counts are: {ncr_districts.shape[0]}\n")

# NCR Districts Output
ncr_districts

In [None]:
def info_details(source):
    
    data = source
    print(f"There are {data.shape[0]} records\n")
    
    ur_count = data.groupby(['urbanruralbasedon2020cph'])['urbanruralbasedon2020cph'].count()
    print("Count of Urban/Rural based on 2020 CPH:")
    print(f"{ur_count.to_string(index=True, header=False)}\n")
    
    status_count = data.groupby(['status'])['status'].count()
    print("Count of Status:")
    print(f"{status_count.to_string(index=True, header=False)}\n")
    
    class_count = data.groupby(['incomeclassification'])['incomeclassification'].count()
    print("Count of Income Classification:")
    print(f"{class_count.to_string(index=True, header=False)}\n")     
    
    return data

In [None]:
# Urban Baranggay Based on 2020 CPH
urban = filter_cols(psgc, 'urbanruralbasedon2020cph', "U")

# Display rural dataframe
urban

In [None]:
# Rural Baranggay Based on 2020 CPH
rural = filter_cols(psgc, 'urbanruralbasedon2020cph', "R")

# Display rural dataframe
rural

In [None]:
# CC (Components Cities) based on City Class
cc = filter_cols(psgc, 'cityclass', 'CC')

# NCR Districts Output
cc

In [None]:
# ICC (Independent Components Cities) based on City Class
icc = filter_cols(psgc, 'cityclass', 'ICC')

# ICC
icc

In [None]:
# HUC (Highly Urbanized Cities) based on City Class
# Population: Minimum 200,000
# Annual Income: Php. 50M
huc = filter_cols(psgc, 'cityclass', 'HUC')

# HUC
huc

In [126]:
brgy_details = info_details(baranggay)

brgy_details

NameError: name 'info_details' is not defined