In [11]:
import os
import re
import tabula
import pandas as pd
import numpy as np

In [12]:
# Export JAVA_HOME for this process so that it is visible through os.environ.
# NOTE(review): the value is a machine-specific, Homebrew-style path — adjust it
# when running the notebook on another machine.
os.environ.update(JAVA_HOME="/opt/homebrew/opt/openjdk/libexec/openjdk.jdk")

In [13]:
# Lift pandas' display limits: None means "no cap" on printed rows / columns.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

In [7]:
%pip install -q tabula-py
%pip install requests beautifulsoup4
%pip install openpyxl
%pip install selenium
%pip install webdriver-manager

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### **Importing**

Scrape PELCO I's rates page for its "Generation Rates" entries, then download the matching monthly PDF files.

In [20]:
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import requests

# Folder to store the downloaded files
# Destination directory for every downloaded PDF.
download_folder = "pdf downloads"
# exist_ok=True turns this into a no-op when the directory is already present.
os.makedirs(name=download_folder, exist_ok=True)

# Function to extract year-month codes from the dynamically loaded HTML
def extract_year_month_codes(url):
    """Scrape a rendered rates page and return one code per "Generation Rates" row.

    The page is opened in headless Chrome (the driver binary is resolved through
    ``ChromeDriverManager``). Every table row whose third cell contains
    "Generation Rates" contributes one string made of the row's year text
    followed by the two-digit number of its month name, e.g. ``"202401"``.

    Args:
        url: Address of the page to load.

    Returns:
        list[str]: The codes in page order. Rows with fewer than three cells or
        with an unrecognised month name are skipped. If an element lookup fails
        part-way through, the error is printed and the codes collected up to
        that point are returned.

    Raises:
        Exception: Errors from starting Chrome or from ``driver.get(url)``
            propagate to the caller; in the latter case the browser is still
            shut down first.
    """
    # Built once per call rather than once per matching row.
    month_map = {
        "January": "01", "February": "02", "March": "03",
        "April": "04", "May": "05", "June": "06",
        "July": "07", "August": "08", "September": "09",
        "October": "10", "November": "11", "December": "12"
    }

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    year_month_codes = []
    try:
        driver.get(url)
        # Element lookups below keep retrying for up to 10 seconds, which gives
        # the dynamically loaded table time to appear.
        driver.implicitly_wait(10)

        # Locate rows with "Generation Rates"
        try:
            rows = driver.find_elements(By.CSS_SELECTOR, 'div.w-full.grid.grid-cols-5.border-b.border-gray-200')
            for row in rows:
                headers = row.find_elements(By.CSS_SELECTOR, 'div.flex.items-center.justify-start.px-4.py-2')
                if len(headers) < 3:
                    continue

                # Extract text values
                month_text = headers[0].find_element(By.TAG_NAME, 'h6').text.strip()
                year_text = headers[1].find_element(By.TAG_NAME, 'h6').text.strip()
                rate_text = headers[2].find_element(By.TAG_NAME, 'h6').text.strip()

                if "Generation Rates" not in rate_text:
                    continue
                if month_text in month_map:
                    year_month_codes.append(f"{year_text}{month_map[month_text]}")
        except Exception as e:
            # Best effort: report the problem and keep whatever was collected.
            print(f"An error occurred: {e}")
    finally:
        # Always release the browser, even when loading the page failed;
        # otherwise a headless Chrome process is left running.
        driver.quit()

    return year_month_codes

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download ``url`` into ``download_folder`` under the name ``file_name``.

    The outcome is reported with ``print``; nothing is returned. A failed
    download (network error, timeout, or a non-200 response) is reported and
    leaves no file behind, so a batch of downloads can carry on with the next
    item.

    Args:
        url: Address of the file to fetch.
        file_name: Name of the file to create inside ``download_folder``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection problems and timeouts are treated like any other failure.
        print(f"Failed to download: {file_name} ({exc})")
        return

    if response.status_code != 200:
        print(f"Failed to download: {file_name} (HTTP {response.status_code})")
        return

    file_path = os.path.join(download_folder, file_name)
    with open(file_path, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded: {file_name}")

# Main
# rates_url is the page that gets scraped; base_url is the fixed prefix of each
# PDF's download URL, to which the year-month code and extension are appended.
rates_url = "https://www.pelco1.org.ph/rates"
base_url = "https://firebasestorage.googleapis.com/v0/b/pelco1-cms.appspot.com/o/rates%2Fgen_"
file_extension = ".pdf"

# Extract year-month codes and download files
year_month_list = extract_year_month_codes(rates_url)
for ym in year_month_list:
    # The local file name and the remote URL share one extension constant so
    # the two cannot drift apart.
    file_name = f"gen_{ym}{file_extension}"
    url = f"{base_url}{ym}{file_extension}?alt=media"
    download_file(url, file_name)

Downloaded: gen_202401.pdf
Downloaded: gen_202402.pdf
Downloaded: gen_202403.pdf
Downloaded: gen_202404.pdf
Downloaded: gen_202405.pdf
Downloaded: gen_202406.pdf
Downloaded: gen_202407.pdf
Failed to download: gen_202408.pdf
Downloaded: gen_202301.pdf
Downloaded: gen_202302.pdf
Downloaded: gen_202303.pdf
Downloaded: gen_202304.pdf
Downloaded: gen_202305.pdf
Downloaded: gen_202306.pdf
Downloaded: gen_202307.pdf
Downloaded: gen_202308.pdf
Downloaded: gen_202309.pdf
Downloaded: gen_202310.pdf
Downloaded: gen_202311.pdf
Downloaded: gen_202312.pdf
Downloaded: gen_202201.pdf
Downloaded: gen_202202.pdf
Downloaded: gen_202203.pdf
Downloaded: gen_202204.pdf
Downloaded: gen_202205.pdf
Downloaded: gen_202206.pdf
Downloaded: gen_202207.pdf
Downloaded: gen_202208.pdf
Downloaded: gen_202209.pdf
Downloaded: gen_202210.pdf
Downloaded: gen_202211.pdf
Downloaded: gen_202212.pdf
Downloaded: gen_202101.pdf
Downloaded: gen_202102.pdf
Downloaded: gen_202103.pdf
Downloaded: gen_202104.pdf
Downloaded: gen_2021

### **Processing**

In [38]:
# Read page 1 of the gen_202207 PDF in lattice mode and keep the first table
# that tabula returns.
df = tabula.read_pdf(
    "pdf downloads/gen_202207.pdf",
    pages=1,
    lattice=True,
)[0]

In [39]:
# Show the table as extracted; the column headers still carry "\r" line breaks.
df

Unnamed: 0,Source,% to Total\rkWH\rPurchased,( A)\rkWh Purchased,(B)\rBasic Generation\rCost (Php),( C )\r\rOther Cost\rAdjustments\r(Php),( D = B + C )\rTotal Generation\rCost for the month\r(Php),(D/A)\rAverage\rGeneration Cost\r(Php/Kwh)
0,NPC-TSC,,,,,,
1,BILATERAL CONTRACTS with IPPs,,,,,,
2,1.Masinloc Power Partners Co. Ltd. (MPPCL),71%,24919598.0,193323615.52,,193323615.52,7.7579
3,"2.Bac-Man Geothermal, Inc. (BGI)",29%,10809007.0,56183055.27,,56183055.27,5.1978
4,Others (PCR & Other Adj.),,,,,,0.0002
5,,,,,,,
6,TOTAL,100%,35728605.0,249506670.79,,249506670.79,6.9836


In [41]:
# Column(s) in which at least one cell contains the text "CONTRACTS"
contract_column = df.columns[
    df.astype(str).apply(lambda cells: cells.str.contains('CONTRACTS')).any()
].tolist()

# Column(s) whose header contains "(A)" once spaces are removed (e.g. "( A)")
a_column = list(filter(lambda name: '(A)' in name.replace(' ', ''), df.columns))

# Column(s) whose header contains "Average"
average_column = list(filter(lambda name: 'Average' in name, df.columns))

# Keep the three groups in this order: contracts, (A), Average
columns_to_keep = [*contract_column, *a_column, *average_column]

df_sliced_columns = df[columns_to_keep]

df_sliced_columns

Unnamed: 0,Source,( A)\rkWh Purchased,(D/A)\rAverage\rGeneration Cost\r(Php/Kwh)
0,NPC-TSC,,
1,BILATERAL CONTRACTS with IPPs,,
2,1.Masinloc Power Partners Co. Ltd. (MPPCL),24919598.0,7.7579
3,"2.Bac-Man Geothermal, Inc. (BGI)",10809007.0,5.1978
4,Others (PCR & Other Adj.),,0.0002
5,,,
6,TOTAL,35728605.0,6.9836


In [44]:
def row_filter(row):
    """Return True when any cell of ``row`` matches the keep-pattern.

    Each cell is converted to text and tested against ``^\\d+|WESM|TOTAL``:
    the text starts with a digit, or contains "WESM" or "TOTAL" anywhere.

    NOTE(review): the test runs on every cell, not only the name column, so a
    numeric cell such as ``0.0002`` also satisfies the leading-digit branch.
    Confirm that keeping such rows is intended.
    """
    matches = row.astype(str).str.contains(r'^\d+|WESM|TOTAL', regex=True)
    for hit in matches:
        if hit:
            return True
    return False

# Boolean mask with one entry per row: True where row_filter accepts the row
filtered_rows = df_sliced_columns.apply(row_filter, axis="columns")

# Keep the accepted rows only, then give the three columns short names
df_sliced = df_sliced_columns.loc[filtered_rows]
df_sliced.columns = ["Power Supplier", "kWh", "Average Generation Cost"]

df_sliced

Unnamed: 0,Power Supplier,kWh,Average Generation Cost
2,1.Masinloc Power Partners Co. Ltd. (MPPCL),24919598.0,7.7579
3,"2.Bac-Man Geothermal, Inc. (BGI)",10809007.0,5.1978
4,Others (PCR & Other Adj.),,0.0002
6,TOTAL,35728605.0,6.9836
