In [1]:
import os
import re
import tabula
import pandas as pd
import numpy as np

In [2]:
# Set the JAVA_HOME environment variable to the Java installation directory
os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk/libexec/openjdk.jdk"

In [3]:
# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [4]:
%pip install -q tabula-py
%pip install requests beautifulsoup4
%pip install openpyxl
%pip install selenium
%pip install webdriver-manager

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### **Importing**

In [8]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = "https://morepower.com.ph/monthly-rates/"
pdf_folder = "pdf downloads"

# Create the folder if it doesn't exist
os.makedirs(pdf_folder, exist_ok=True)

# Define headers to include in the request
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Send a GET request to the webpage with headers
response = requests.get(url, headers=headers)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all links that end with .pdf
    pdf_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.pdf')]
    
    # Download each PDF
    for link in pdf_links:
        pdf_url = urljoin(url, link)  # Create the full URL if it's relative
        pdf_filename = os.path.join(pdf_folder, os.path.basename(pdf_url))
        
        # Send a GET request to the PDF URL
        pdf_response = requests.get(pdf_url, headers=headers)
        
        if pdf_response.status_code == 200:
            # Save the PDF content
            with open(pdf_filename, "wb") as pdf_file:
                pdf_file.write(pdf_response.content)
            print(f"Downloaded {pdf_filename}")
        else:
            print(f"Failed to download {pdf_url}. Status code: {pdf_response.status_code}")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

Downloaded pdf downloads/Generation-Charge_July2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_June-2024_Web-Upload_revised0618.pdf
Downloaded pdf downloads/Generation-Charge_May-2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Apr-2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Mar-2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Feb-2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Jan-2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Dec-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Nov-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Oct-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Sep-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_August-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_July-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_June-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge-f

### **Processing**

In [79]:
pdf_path = "pdf downloads/GenCharge_Mar2020.pdf"

# Read all tables from the specified page(s)
tables = tabula.read_pdf(pdf_path, lattice=True, pages='all', multiple_tables=True)

# Ensure there is at least one table
if not tables:
    raise ValueError("No tables found in the PDF.")

# Process the first table (assuming it has headers)
first_table = tables[0]

# Extract column names from the first table
first_table_columns = first_table.columns

# Initialize a list to store processed DataFrames
processed_tables = [first_table]

# Process each subsequent table
for table in tables[1:]:
    # Convert the header row of the current table to a DataFrame row
    table_header_as_row = pd.DataFrame([table.columns.tolist()], columns=table.columns)
    
    # Append the header row to the current table data
    table_with_header_as_row = pd.concat([table_header_as_row, table], ignore_index=True)
    
    # Check if the number of columns matches
    if len(table_with_header_as_row.columns) != len(first_table_columns):
        # Align columns by adding missing columns with NaN values
        missing_cols = len(first_table_columns) - len(table_with_header_as_row.columns)
        if missing_cols > 0:
            # Add missing columns with NaN values
            table_with_header_as_row = pd.concat(
                [table_with_header_as_row, pd.DataFrame(columns=[f"Column_{i+1}" for i in range(missing_cols)])], 
                axis=1
            )
        elif missing_cols < 0:
            # Truncate extra columns
            table_with_header_as_row = table_with_header_as_row.iloc[:, :len(first_table_columns)]
    
    # Rename columns of the current table to match the first table
    table_with_header_as_row.columns = first_table_columns
    
    # Append the table to the list of processed tables
    processed_tables.append(table_with_header_as_row)

# Combine all tables into a single DataFrame, stacking vertically
df = pd.concat(processed_tables, ignore_index=True)

df

Unnamed: 0.1,GENERATION CHARGE for MARCH 2020,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,Power Supplier,% Share on\rEnergy\rPurchased,Energy\rPurchased\rkWh,Basic Generation\rCost\rPhP,Other Cost\rAdjustment\rPhP,Total Generation\rCost for the Month\rPhP,Average\rGeneration Rate\rPhP/KWH
1,,(A),(B),(C),(D = B+C),(D/A),
2,BILATERAL CONTRACTS,,,,,,
3,1. Panay Energy Development Corporation,71.28%,32192445.03,208336450.66,-,208336450.66,6.4716
4,2. Panay Power Corporation,0.48%,218721.00,22666456.41,-,22666456.41,103.6318
5,3. KEPCO SPC Power Corporation,6.38%,2880000.00,13436640.00,-,13436640.00,4.6655
6,"4. Aboitiz Power Renewables, Incorporated",0.00%,-,-,-,-,-
7,Subtotal - Emergency Power Supply Contracts,78.15%,35291166.03,244439547.07,-,244439547.07,6.9264
8,Wholesale Electricity Spot Market (WESM),21.85%,9869469.07,17764519.34,-,"17,764,519.34.1",1.7999
9,Export Energy from Net Metering Customers,0.00%,-,-,-,-,-


In [80]:
# Function to find the column name where a keyword is present in any of its rows
def find_column_by_keyword(df, keyword):
    mask = df.apply(lambda col: col.str.contains(keyword, case=False, na=False)).any()
    return df.columns[mask]

# Identifying the columns that contain the specified keywords in any of their rows
columns_to_include = [
    find_column_by_keyword(df, 'CONTRACTS')[0],
    find_column_by_keyword(df, '% Share')[0],
    find_column_by_keyword(df, 'Purchased')[1],  # Get the second column with "Purchased"
    find_column_by_keyword(df, 'Average')[0],
]

# Creating a new DataFrame with the selected columns
sliced_cols_df = df[columns_to_include]

sliced_cols_df

Unnamed: 0.1,GENERATION CHARGE for MARCH 2020,Unnamed: 0,Unnamed: 1,Unnamed: 5
0,Power Supplier,% Share on\rEnergy\rPurchased,Energy\rPurchased\rkWh,Average\rGeneration Rate\rPhP/KWH
1,,(A),(B),
2,BILATERAL CONTRACTS,,,
3,1. Panay Energy Development Corporation,71.28%,32192445.03,6.4716
4,2. Panay Power Corporation,0.48%,218721.00,103.6318
5,3. KEPCO SPC Power Corporation,6.38%,2880000.00,4.6655
6,"4. Aboitiz Power Renewables, Incorporated",0.00%,-,-
7,Subtotal - Emergency Power Supply Contracts,78.15%,35291166.03,6.9264
8,Wholesale Electricity Spot Market (WESM),21.85%,9869469.07,1.7999
9,Export Energy from Net Metering Customers,0.00%,-,-


In [81]:
# Find the index of the row containing the word "Supply" in the first column
start_index = sliced_cols_df[sliced_cols_df.iloc[:, 0].str.contains('CONTRACTS', case=False, na=False)].index[0] + 1

# Find the index of the row containing the word "Market" in the first column
end_index = sliced_cols_df[
    sliced_cols_df.iloc[:, 0].str.contains(r'WESM', case=False, na=False)
].index[0]

# Slice the DataFrame to include rows between start_index and end_index (inclusive)
sliced_rows_df = sliced_cols_df.iloc[start_index:end_index + 1]

# Exclude the row that contains the word "Metering" in the first column
sliced_rows_df = sliced_rows_df[~sliced_rows_df.iloc[:, 0].str.contains('Metering', case=False, na=False)]

sliced_rows_df.columns = ["Power Supplier", "%", "kWh", "Average Generation Cost"]

sliced_rows_df

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost
3,1. Panay Energy Development Corporation,71.28%,32192445.03,6.4716
4,2. Panay Power Corporation,0.48%,218721.00,103.6318
5,3. KEPCO SPC Power Corporation,6.38%,2880000.00,4.6655
6,"4. Aboitiz Power Renewables, Incorporated",0.00%,-,-
7,Subtotal - Emergency Power Supply Contracts,78.15%,35291166.03,6.9264
8,Wholesale Electricity Spot Market (WESM),21.85%,9869469.07,1.7999


In [82]:
import re

# Function to clean the "Power Supplier" column
def clean_power_supplier(value):
    # Remove leading numbers and periods
    value = re.sub(r'^\d+\.\s*', '', value)
    return value

# Apply the cleaning function to the first column of sliced_df
sliced_rows_df.iloc[:, 0] = sliced_rows_df.iloc[:, 0].apply(clean_power_supplier)

sliced_rows_df = sliced_rows_df.reset_index(drop=True)
sliced_rows_df

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost
0,Panay Energy Development Corporation,71.28%,32192445.03,6.4716
1,Panay Power Corporation,0.48%,218721.00,103.6318
2,KEPCO SPC Power Corporation,6.38%,2880000.00,4.6655
3,"Aboitiz Power Renewables, Incorporated",0.00%,-,-
4,Subtotal - Emergency Power Supply Contracts,78.15%,35291166.03,6.9264
5,Wholesale Electricity Spot Market (WESM),21.85%,9869469.07,1.7999


In [91]:
# Find the column that contains the word "Average"
average_column = sliced_cols_df.columns[sliced_cols_df.apply(lambda col: col.str.contains('Average', case=False, na=False)).any()][0]

# Find rows where the first column contains the word "TOTAL"
total_row = sliced_cols_df.loc[sliced_cols_df.iloc[:, 0].str.contains("TOTAL", case=False, na=False), average_column]
reversed_row = total_row[::-1]

for value in reversed_row:
    if pd.notnull(value):
        total_row_value = value
        break  # Exit loop once a non-null value is found