In [1]:
import os
import re
import tabula
import pandas as pd
import numpy as np

In [2]:
# Set the JAVA_HOME environment variable to the Java installation directory
os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk/libexec/openjdk.jdk"

In [3]:
# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [4]:
%pip install -q tabula-py
%pip install requests beautifulsoup4
%pip install openpyxl
%pip install selenium
%pip install webdriver-manager

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### **Importing**

Download the source code for CASURECO II's webpage.

In [5]:
import requests

url = 'https://www.casureco2.com.ph/support/rates?page=1'
response = requests.get(url)

if response.status_code == 200:
    with open('generation_charge.html', 'w', encoding='utf-8') as file:
        file.write(response.text)
    print("Webpage downloaded and saved as 'generation_charge.html'.")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

Webpage downloaded and saved as 'generation_charge.html'.


In [6]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import requests
import os
import time

# Set up Selenium WebDriver
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)
driver.get('https://www.casureco2.com.ph/support/rates?page=1')

# Create a directory for storing PDFs if it doesn't exist
download_dir = 'pdf downloads'
if not os.path.exists(download_dir):
    os.makedirs(download_dir)

# Function to download PDF
def download_pdf(pdf_url, save_path):
    response = requests.get(pdf_url)
    with open(save_path, 'wb') as file:
        file.write(response.content)
    print(f"Downloaded: {save_path}")

# Function to process the current page
def process_current_page():
    # Wait for the table to load
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'biddings_table')))
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find('table', {'id': 'biddings_table'})
    rows = table.find('tbody').find_all('tr')

    for row in rows:
        # Find the "View PDF" button and click it
        view_pdf_button = row.find('button', {'data-dropdown-toggle': True})
        if view_pdf_button:
            view_pdf_button_id = view_pdf_button['id']
            driver.find_element(By.ID, view_pdf_button_id).click()

            # Wait for the dropdown to be visible
            dropdown_id = f'dropdown{view_pdf_button_id.split("dropdownDefaultButton")[-1]}'
            WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.ID, dropdown_id)))

            # Get the "Generation Charge" link and download the PDF if not disabled
            dropdown = driver.find_element(By.ID, dropdown_id)
            gen_charge_link_element = dropdown.find_element(By.LINK_TEXT, 'Generation Charge')
            gen_charge_link = gen_charge_link_element.get_attribute('href')
            
            # Extract month and year
            month = row.find_all('th')[0].text.strip()
            year = row.find_all('th')[1].text.strip()

            # Print the contents of gen_charge_link and its attributes
            print(f"Link href: {gen_charge_link}")
            print(f"Link attributes: {gen_charge_link_element.get_attribute('outerHTML')}")
            
            if 'disabled' not in gen_charge_link_element.get_attribute('outerHTML'):
                save_path = os.path.join(download_dir, f'{month}_{year}_Generation_Charge.pdf')
                download_pdf(gen_charge_link, save_path)
            else:
                print(f"Link for {month} {year} is disabled, skipping download.")

            # Close the dropdown by clicking again on "View PDF"
            driver.find_element(By.ID, view_pdf_button_id).click()

# Function to check if we are on the last page
def is_last_page():
    try:
        results_text = driver.find_element(By.XPATH, '//p[@class="text-sm text-gray-700 dark:text-white leading-5"]').text
        parts = results_text.split()
        showing_to = int(parts[3])  # This should be the second number in the text "Showing X to Y of Z results"
        total = int(parts[5])       # This should be the total number of results
        return showing_to == total
    except Exception as e:
        print(f"Error checking last page: {e}")
        return False

# Loop through pages
while True:
    process_current_page()
    
    # Check if there is a next page
    if is_last_page():
        break
    
    try:
        next_button = driver.find_element(By.XPATH, '/html/body/div/div[1]/main/section/div[2]/div/div[2]/nav/div[2]/div[2]/span/a[2]')
        next_button.click()
        time.sleep(2)  # Wait for the next page to load
    except Exception as e:
        print(f"An error occurred: {e}")
        break

# Close the driver
driver.quit()

Link href: https://www.casureco2.com.ph/public/public/rates_images/JULY%20GEN-MIX%20-%20Copy_1721352610.pdf
Link attributes: <a href="https://www.casureco2.com.ph/public/public/rates_images/JULY GEN-MIX - Copy_1721352610.pdf" class="block px-4 py-2 hover:bg-gray-100 dark:hover:bg-gray-600 dark:hover:text-white">
                                                                Generation Charge
                                                            </a>
Downloaded: pdf downloads/July_2024_Generation_Charge.pdf
Link href: https://www.casureco2.com.ph/public/public/rates_images/JUNE%20GENERATION%20CHARGE_1718848441.pdf
Link attributes: <a href="https://www.casureco2.com.ph/public/public/rates_images/JUNE GENERATION CHARGE_1718848441.pdf" class="block px-4 py-2 hover:bg-gray-100 dark:hover:bg-gray-600 dark:hover:text-white">
                                                                Generation Charge
                                                            </a>
Downloaded: pdf 

### **Processing**

In [58]:
pdf_dir = "pdf downloads"

# Initialize an empty list to store DataFrames
dataframes = []

# Loop through each PDF file in the directory
for file_name in os.listdir(pdf_dir):
    if file_name.endswith('.pdf'):
        file_path = os.path.join(pdf_dir, file_name)
        
        # Read the PDF file
        df = tabula.read_pdf(file_path, stream=True, pages=1)[0]

        # Extract date from file name (e.g., "April_2024_Generation_Charge.pdf" -> "Apr-2024")
        match = re.match(r"(\w+)_(\d{4})_Generation_Charge\.pdf", file_name)
        if match:
            month, year = match.groups()
            date_str = pd.to_datetime(f"{month} 1, {year}").strftime("%b-%Y")
        else:
            date_str = None
        
        # Define the keywords to search for in row values
        keywords = ['Suppliers', ',', '.', '%']

        # Initialize a list to store the columns to keep
        columns_to_keep = []

        # First column to keep: Find the column containing the word 'Suppliers'
        suppliers_col = df.columns[df.apply(lambda col: col.astype(str).str.contains('Suppliers', case=False, na=False)).any()]
        if not suppliers_col.empty:
            suppliers_col_name = suppliers_col[0]
            columns_to_keep.append(suppliers_col_name)

            # Second column to keep: Find the first column with a comma, excluding the 'Suppliers' column
            comma_col = df.columns[
                df.apply(lambda col: col.astype(str).str.contains(',', case=False, na=False)).any()
            ]
            if not comma_col.empty:
                comma_col_name = comma_col[0]
                
                if comma_col_name != suppliers_col_name:
                    columns_to_keep.append(comma_col_name)
                else:
                    comma_col_name = comma_col[1]
                    columns_to_keep.append(comma_col_name)

            # Third column to keep: Find the column with a percentage symbol, excluding the previous columns
            percent_cols = df.columns[
                df.apply(lambda col: col.astype(str).str.contains('%', case=False, na=False)).any()
            ]
            
            if not percent_cols.empty:
                percent_col_name = percent_cols[0]
                if percent_col_name not in columns_to_keep:
                    columns_to_keep.append(percent_col_name)

            # Fourth column to keep: Find the last column with a period, excluding the previous columns
            period_cols = df.columns[
                df.apply(lambda col: col.astype(str).str.contains('.', case=False, na=False)).any()
            ]
            
            if not period_cols.empty:
                period_cols = [col for col in period_cols if col not in columns_to_keep]
                if period_cols:
                    last_period_col = period_cols[-1]
                    if last_period_col not in columns_to_keep:
                        columns_to_keep.append(last_period_col)

        # Drop duplicates to ensure unique columns
        columns_to_keep = list(dict.fromkeys(columns_to_keep))

        # Slice the DataFrame to keep only the relevant columns
        df_sliced_columns = df[columns_to_keep]

        # Reorder columns to ensure 'Energy Share' is the third column
        if 'Energy Share' in df_sliced_columns.columns:
            columns_order = [col for col in columns_to_keep if col != 'Energy Share'] + ['Energy Share']
            df_sliced_columns = df_sliced_columns[columns_order]

        # Rename columns for consistency if the length matches
        if len(df_sliced_columns.columns) == 4:
            df_sliced_columns.columns = ["Power Supplier", "kWh", "Energy Share", "Average Generation Cost"]

        # Find the index of rows containing specific keywords in the first column
        suppliers_index = df_sliced_columns.index[df_sliced_columns.iloc[:, 0].str.contains('Suppliers', case=False, na=False)]
        sub_total_index = df_sliced_columns.index[df_sliced_columns.iloc[:, 0].str.contains('Sub-Total', case=False, na=False)]
        
        # Ensure we have found these rows
        if not suppliers_index.empty and not sub_total_index.empty:
            suppliers_index = suppliers_index[0]
            sub_total_index = sub_total_index[0]

            # Slice the DataFrame from after "Suppliers" row to "Sub-Total" row (inclusive)
            df_filtered = df_sliced_columns.loc[suppliers_index + 1: sub_total_index]

            # Add the extracted date to the DataFrame
            df_filtered['Date'] = date_str

            # Append the filtered DataFrame to the list
            dataframes.append(df_filtered)

# Concatenate all DataFrames in the list into a single DataFrame
big_df = pd.concat(dataframes, ignore_index=True)

big_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Date'] = date_str
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Date'] = date_str
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Date'] = date_str
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = val

Unnamed: 0,Power Supplier,kWh,Energy Share,Average Generation Cost,Date
0,PSALM,4052294,9%,6.277,Jun-2024
1,Net Metering,8861,0.02%,6.7472,Jun-2024
2,Wholesale Electricity Spot Market (WESM),43554260,91%,2.0362,Jun-2024
3,Sub-Total,47615415,100%,,Jun-2024
4,MPPCL-AES,429000,1%,2.8138,May-2024
5,PSALM,4556654,10%,5.8167,May-2024
6,Net Metering,10075,0.02%,4.9881,May-2024
7,Wholesale Electricity Spot Market (WESM),39382800,89%,6.7289,May-2024
8,Sub-Total,44378529,100%,,May-2024
9,MPPCL-AES,11221550,31%,3.151,Apr-2024


#### Creating Supplier Dataframe

In [60]:
unique_suppliers = big_df['Power Supplier'].unique()
unique_suppliers

array(['PSALM', 'Net Metering',
       'Wholesale Electricity Spot Market (WESM)', 'Sub-Total',
       'MPPCL-AES'], dtype=object)

In [61]:
# Create a mapping of power suppliers to unique IDs
supplier_id_map = {supplier: id+1 for id, supplier in enumerate(unique_suppliers)}

# Create a new DataFrame from the mapping
supplier_df = pd.DataFrame(list(supplier_id_map.items()), columns=['Power Supplier', 'Power Supplier ID'])

supplier_df

Unnamed: 0,Power Supplier,Power Supplier ID
0,PSALM,1
1,Net Metering,2
2,Wholesale Electricity Spot Market (WESM),3
3,Sub-Total,4
4,MPPCL-AES,5


In [62]:
# Create a mapping from Power Suppliers to Supplier IDs
supplier_mapping = dict(zip(supplier_df['Power Supplier'], supplier_df['Power Supplier ID']))

# Replace names with IDs in big_df
big_df['Power Supplier ID'] = big_df['Power Supplier'].map(supplier_mapping)

# Ensure IDs are not converted to floats
#big_df['Power Supplier ID'] = big_df['Power Supplier ID'].astype(int)

# Drop the old Power Suppliers column
big_df = big_df.drop(columns=['Power Supplier'])

big_df.head()

Unnamed: 0,kWh,Energy Share,Average Generation Cost,Date,Power Supplier ID
0,4052294,9%,6.277,Jun-2024,1
1,8861,0.02%,6.7472,Jun-2024,2
2,43554260,91%,2.0362,Jun-2024,3
3,47615415,100%,,Jun-2024,4
4,429000,1%,2.8138,May-2024,5


In [63]:
with pd.ExcelWriter("Historical_CASURECO_II_GC_Breakdown.xlsx", engine='openpyxl') as writer:
    big_df.to_excel(writer, sheet_name='Historical GC', index=False)
    supplier_df.to_excel(writer, sheet_name='Supplier IDs', index=False)

#### for troubleshooting

In [49]:
df = tabula.read_pdf("pdf downloads/June_2024_Generation_Charge.pdf", stream=True, pages=1)[0]

In [50]:
df

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,(A),(B),(C ),(D),E (E= C + D),F (F=E/A)
0,,,,,,Other,Total Generation Cost,
1,,,,Energy Share,Basic Generation Cost,,,Generation Rate
2,,Source,kWh Purchased,,,Charges/Adjustments,for the month,
3,,,,(%),(Php),,,(Php/kWh)
4,,,,,,(Php),(Php),
5,Power Suppliers /IPPs,,,,,,,
6,PSALM,,4052294,9%,25336902.74,99281.21,25436183.95,6.2770
7,Net Metering,,8861,0.02%,59786.94,,59786.94,6.7472
8,Wholesale Electricity Spot Market (WESM),,43554260,91%,88610362,74068,88684429.46,2.0362
9,Sub-Total,,47615415,100%,114007051.59,173348.76,114180400.35,


In [51]:
# Define the keywords to search for in row values
keywords = ['Suppliers', ',', '.', '%']

# Initialize a list to store the columns to keep
columns_to_keep = []

# First column to keep: Find the column containing the word 'Suppliers'
suppliers_col = df.columns[df.apply(lambda col: col.astype(str).str.contains('Suppliers', case=False, na=False)).any()]
if not suppliers_col.empty:
    suppliers_col_name = suppliers_col[0]
    columns_to_keep.append(suppliers_col_name)

    # Second column to keep: Find the first column with a comma, excluding the 'Suppliers' column
    comma_col = df.columns[
        df.apply(lambda col: col.astype(str).str.contains(',', case=False, na=False)).any()
    ]
    if not comma_col.empty:
        comma_col_name = comma_col[0]
        
        if comma_col_name != suppliers_col_name:
            columns_to_keep.append(comma_col_name)
        else:
            comma_col_name = comma_col[1]
            columns_to_keep.append(comma_col_name)

        # Third column to keep: Find the column with a percentage symbol, excluding the previous columns
        percent_cols = df.columns[
            df.apply(lambda col: col.astype(str).str.contains('%', case=False, na=False)).any()
        ]
        
        if not percent_cols.empty:
            percent_col_name = percent_cols[0]
            if percent_col_name not in columns_to_keep:
                columns_to_keep.append(percent_col_name)

        # Fourth column to keep: Find the last column with a period, excluding the previous columns
        period_cols = df.columns[
            df.apply(lambda col: col.astype(str).str.contains('.', case=False, na=False)).any()
        ]
        
        if not period_cols.empty:
            period_cols = [col for col in period_cols if col not in columns_to_keep]
            if period_cols:
                last_period_col = period_cols[-1]
                if last_period_col not in columns_to_keep:
                    columns_to_keep.append(last_period_col)

# Drop duplicates to ensure unique columns
columns_to_keep = list(dict.fromkeys(columns_to_keep))

# Slice the DataFrame to keep only the relevant columns
df_sliced_columns = df[columns_to_keep]

# Reorder columns to ensure 'Energy Share' is the third column
if 'Energy Share' in df_sliced_columns.columns:
    columns_order = [col for col in columns_to_keep if col != 'Energy Share'] + ['Energy Share']
    df_sliced_columns = df_sliced_columns[columns_order]

# Rename columns for consistency if the length matches
if len(df_sliced_columns.columns) == 4:
    df_sliced_columns.columns = ["Power Supplier", "kWh", "Energy Share", "Average Generation Cost"]

df_sliced_columns

Unnamed: 0,Power Supplier,kWh,Energy Share,Average Generation Cost
0,,,,
1,,,Energy Share,Generation Rate
2,,kWh Purchased,,
3,,,(%),(Php/kWh)
4,,,,
5,Power Suppliers /IPPs,,,
6,PSALM,4052294,9%,6.2770
7,Net Metering,8861,0.02%,6.7472
8,Wholesale Electricity Spot Market (WESM),43554260,91%,2.0362
9,Sub-Total,47615415,100%,


In [52]:
# Sample DataFrame for demonstration (replace this with your actual DataFrame)
# df_sliced_columns = pd.read_csv('your_data.csv')  # Load your actual DataFrame

# Find the index of rows containing specific keywords in the first column
suppliers_index = df_sliced_columns.index[df_sliced_columns.iloc[:, 0].str.contains('Suppliers', case=False, na=False)]
sub_total_index = df_sliced_columns.index[df_sliced_columns.iloc[:, 0].str.contains('Sub-Total', case=False, na=False)]
#total_generation_cost_index = df_sliced_columns.index[df_sliced_columns.iloc[:, 0].str.contains('TOTAL GENERATION COST', case=False, na=False)]

# Ensure we have found these rows
if not suppliers_index.empty and not sub_total_index.empty:
    suppliers_index = suppliers_index[0]
    sub_total_index = sub_total_index[0]

    # Slice the DataFrame from after "Suppliers" row to "Sub-Total" row (inclusive)
    df_filtered = df_sliced_columns.loc[suppliers_index + 1: sub_total_index]

df_filtered

Unnamed: 0,Power Supplier,kWh,Energy Share,Average Generation Cost
6,PSALM,4052294,9%,6.277
7,Net Metering,8861,0.02%,6.7472
8,Wholesale Electricity Spot Market (WESM),43554260,91%,2.0362
9,Sub-Total,47615415,100%,


In [56]:
# Ensure that df_filtered is a copy to avoid SettingWithCopyWarning
df_filtered = df_filtered.copy()

# Find the row with 'TOTAL' and extract the value from 'Column Average'
total_row = df_sliced_columns[df_sliced_columns['Power Supplier'].str.contains('TOTAL GENERATION COST', case=False, na=False)]
generation_charge = total_row['Average Generation Cost'].values[0] if not total_row.empty else None

# Add the new column "Generation Charge" with the extracted value
df_filtered.loc[:, 'Generation Charge'] = generation_charge

# Reset the index of the final dataframe
df_filtered.reset_index(drop=True, inplace=True)

df_filtered

Unnamed: 0,Power Supplier,kWh,Energy Share,Average Generation Cost,Generation Charge
0,PSALM,4052294,9%,6.277,2 .7423
1,Net Metering,8861,0.02%,6.7472,2 .7423
2,Wholesale Electricity Spot Market (WESM),43554260,91%,2.0362,2 .7423
3,Sub-Total,47615415,100%,,2 .7423
