In [1]:
import os
import re
import tabula
import pandas as pd
import numpy as np

In [2]:
# Set the JAVA_HOME environment variable to the Java installation directory
os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk/libexec/openjdk.jdk"

In [3]:
# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [4]:
%pip install -q tabula-py
%pip install requests beautifulsoup4
%pip install openpyxl

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### **Importing**

Download the source code for AEC's webpage.

In [5]:
import requests
from bs4 import BeautifulSoup

# URL to be scraped
url = "https://angeleselectric.com.ph/generation-charge/"

# Make a GET request to fetch the raw HTML content
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Save the raw HTML to a file
    with open('generation_charge.html', 'w', encoding='utf-8') as file:
        file.write(soup.prettify())
    
    print("HTML content has been saved successfully.")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

HTML content has been saved successfully.


In [16]:
# Path to the HTML file
html_file_path = 'generation_charge.html'

# Directory to save downloaded PDFs
save_dir = 'pdf downloads'
os.makedirs(save_dir, exist_ok=True)

# Load the HTML content
with open(html_file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

# Parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# Find all <a> tags with text containing "PDF"
links = soup.find_all('a', string=re.compile(r'PDF', re.IGNORECASE))

# Check if any links are found
if not links:
    print("No links found with the text containing 'PDF'.")

# Function to sanitize file names
def sanitize_filename(filename):
    return "".join(c for c in filename if c.isalnum() or c in (' ', '_', '-')).rstrip()

# Download each PDF
for link in links:
    pdf_url = link['href']
    
    # Find the closest preceding <strong> tag
    strong_tag = link.find_previous('strong')
    if strong_tag:
        file_name = strong_tag.get_text(strip=True)
    else:
        file_name = link.text.strip().replace(' ', '_')
    
    sanitized_file_name = sanitize_filename(file_name) + '.pdf'
    file_path = os.path.join(save_dir, sanitized_file_name)
    
    # Download the PDF file
    response = requests.get(pdf_url)
    if response.status_code == 200:
        with open(file_path, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f'Downloaded: {file_path}')
    else:
        print(f'Failed to download: {pdf_url}')

print('Download completed.')


Downloaded: pdf downloads/July 2024 Generation Charge.pdf
Downloaded: pdf downloads/June 2024 Generation Charge.pdf
Downloaded: pdf downloads/May 2024 Generation Charge.pdf
Downloaded: pdf downloads/April 2024 Generation Charge.pdf
Downloaded: pdf downloads/March 2024 Generation Charge.pdf
Downloaded: pdf downloads/February 2024 Generation Charge.pdf
Downloaded: pdf downloads/January 2024 Generation Charge.pdf
Downloaded: pdf downloads/December 2023 Generation Charge.pdf
Downloaded: pdf downloads/November 2023 Generation Charge.pdf
Downloaded: pdf downloads/October 2023 Generation Charge.pdf
Downloaded: pdf downloads/September 2023 Generation Charge.pdf
Downloaded: pdf downloads/August 2023 Generation Charge.pdf
Downloaded: pdf downloads/July 2023 Generation Charge.pdf
Downloaded: pdf downloads/June 2023 Generation Charge.pdf
Downloaded: pdf downloads/May 2023 Generation Charge.pdf
Downloaded: pdf downloads/April 2023 Generation Charge.pdf
Downloaded: pdf downloads/March 2023 Generatio

### **Processing**

In [199]:
import os
import pandas as pd
import re
import tabula

# Function to process a single PDF file
def process_pdf(file_path):
    # Read the PDF file
    df = tabula.read_pdf(file_path, stream=True, pages=1)[0]

    # Function to find the column containing 'SOURCES'
    def find_column_with_sources(df):
        for col in df.columns:
            if df[col].astype(str).str.contains('SOURCES', case=False, na=False).any():
                return col
        return None

    # Function to find columns containing a keyword
    def find_columns_with_keywords(df, keywords):
        columns = [col for col in df.columns if df[col].astype(str).str.contains('|'.join(keywords), case=False, na=False).any()]
        return columns

    # Function to clean the 'Power Supplier' column
    def clean_power_supplier(name):
        cleaned_name = re.sub(r'^\d+\.\s*', '', name)  # Remove leading numbers and period
        cleaned_name = re.sub(r'\s*\(.*\)', '', cleaned_name)  # Remove text in parentheses
        return cleaned_name.strip()

    # Function to clean the 'Average Generation Cost' column
    def clean_avg_gen_cost(cost):
        return re.sub(r'^P\s*', '', cost).strip()  # Remove leading 'P' and any extra spaces

    # Check for the presence of "Average"
    contains_average = df.astype(str).apply(lambda x: x.str.contains('PhP/kWh|Php/kWh', case=False, na=False)).any().any()

    # Define the initial columns to keep based on keywords
    if contains_average:
        columns_to_keep = [col for col in df.columns if df[col].astype(str).str.contains('SOURCES|Input|Purchased|Share|PhP/kWh|Php/kWh').any()]
    else:
        columns_to_keep = [col for col in df.columns if df[col].astype(str).str.contains('SOURCES|Input|Purchased|Share|Total').any()]

    # Filter the DataFrame to include only the columns to keep
    df_new = df[columns_to_keep]

    # Find the column with 'SOURCES' in df
    col_name_df = find_column_with_sources(df)

    if col_name_df:
        df[col_name_df] = df[col_name_df].astype(str)
        keywords = ['Contract', 'WESM']
        df_new = df_new[df_new[col_name_df].str.contains('|'.join(keywords), case=False, na=False)]
        df_sliced = df.loc[df_new.index]

    if contains_average:
        df_new.columns = ['Power Supplier', 'kWh', '%', 'Average Generation Cost']
        avg_col = df_new.columns[df_new.apply(lambda col: col.astype(str).str.contains('\.', na=False).any())].tolist()[-1]
        df_new['Power Supplier'] = df_new['Power Supplier'].apply(clean_power_supplier)
        df_new['Average Generation Cost'] = df_new[avg_col].astype(str).str.replace(r'[^\d.]', '', regex=True)
        df_new['Average Generation Cost'] = pd.to_numeric(df_new['Average Generation Cost'], errors='coerce')
    else:
        df_new.columns = ['Power Supplier', 'kWh', '%', 'Total Generation Cost']
        last_dot_columns = df.columns[df.apply(lambda col: col.astype(str).str.contains('\.', na=False).any())].tolist()
        total_col = None
        for col in last_dot_columns:
            if df[col].astype(str).str.contains(r'\(PhP\)', case=False, na=False).any():
                total_col = col
                break
        if not total_col:
            if len(last_dot_columns) >= 2:
                total_col = last_dot_columns[-2]
            else:
                total_col = last_dot_columns[-1]
        kwh_col = 'kWh'
        if kwh_col in df_new.columns:
            df_new['Total Generation Cost'] = df_new['Total Generation Cost'].astype(str).str.replace(r'[^\d.]', '', regex=True)
            df_new['Total Generation Cost'] = pd.to_numeric(df_new['Total Generation Cost'], errors='coerce')
            df_new[kwh_col] = df_new[kwh_col].astype(str).str.replace(r'[^\d.]', '', regex=True)
            df_new[kwh_col] = pd.to_numeric(df_new[kwh_col], errors='coerce')
            df_new['Average Generation Cost'] = df_new['Total Generation Cost'] / df_new[kwh_col]
            df_new = df_new.drop(columns=['Total Generation Cost'])
            df_new.columns = ['Power Supplier', 'kWh', '%', 'Average Generation Cost']
            df_new['Power Supplier'] = df_new['Power Supplier'].apply(clean_power_supplier)

    charge_rows = df[df[col_name_df].str.contains('for', case=False, na=False)]
    if not charge_rows.empty:
        charge_row = charge_rows.iloc[-1]
        for value in reversed(charge_row[2:]):
            if not pd.isna(value):
                rate_value = re.sub(r'^P\s*|[,]', '', value).strip()
                break
        else:
            rate_value = None
    else:
        rate_value = None

    df_new['Generation Charge'] = rate_value
    
    return df_new

# Directory containing PDF files
pdf_dir = "pdf downloads"

# Initialize an empty DataFrame to store results
big_df = pd.DataFrame()

# Process each PDF file in the directory
for file_name in os.listdir(pdf_dir):
    if file_name.endswith(".pdf"):
        file_path = os.path.join(pdf_dir, file_name)
        
        # Process the PDF and get the DataFrame
        processed_df = process_pdf(file_path)
        
        # Extract the date from the file name
        month_year = file_name.split(' ')[0:2]  # Get the Month and Year
        month = month_year[0]
        year = month_year[1]
        # Map month names to abbreviations
        month_map = {
            "January": "Jan", "February": "Feb", "March": "Mar", "April": "Apr", "May": "May", "June": "Jun",
            "July": "Jul", "August": "Aug", "September": "Sep", "October": "Oct", "November": "Nov", "December": "Dec"
        }
        month_abbr = month_map.get(month, month)  # Default to month if not found
        date_str = f"{month_abbr}-{year}"
        
        # Add the Date column
        processed_df['Date'] = date_str
        
        # Append to big_df
        big_df = pd.concat([big_df, processed_df], ignore_index=True)

big_df

  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in r

Unnamed: 0,Power Supplier,kWh,%,Average Generation Cost,Generation Charge,Date
0,GNPower Mariveles,37692892,65.6%,6.139583,5.9543,Oct-2021
1,Anda Power Corp..,7440000,13.0%,6.583905,5.9543,Oct-2021
2,Angeles Power Inc.,22060,0.04%,793.415525,5.9543,Oct-2021
3,WESM,12200760,21.2%,4.826696,5.9543,Oct-2021
4,GNPower Mariveles,33473073,56.6%,10.8863,10.8367,Nov-2022
5,Anda Power Corp.,10743750,18.2%,9.7757,10.8367,Nov-2022
6,WESM,14460080,24.4%,10.3573,10.8367,Nov-2022
7,Angeles Power Inc.,292224,0.5%,68.2395,10.8367,Nov-2022
8,GNPower Mariveles,38653942,66.9%,10.6644,10.8979,Dec-2022
9,Anda Power Corp.,11160000,19.3%,9.7483,10.8979,Dec-2022


In [200]:
big_df.head()

Unnamed: 0,Power Supplier,kWh,%,Average Generation Cost,Generation Charge,Date
0,GNPower Mariveles,37692892,65.6%,6.139583,5.9543,Oct-2021
1,Anda Power Corp..,7440000,13.0%,6.583905,5.9543,Oct-2021
2,Angeles Power Inc.,22060,0.04%,793.415525,5.9543,Oct-2021
3,WESM,12200760,21.2%,4.826696,5.9543,Oct-2021
4,GNPower Mariveles,33473073,56.6%,10.8863,10.8367,Nov-2022


#### Creating Supplier Dataframe

In [201]:
unique_suppliers = big_df['Power Supplier'].unique()

unique_suppliers

array(['GNPower Mariveles', 'Anda Power Corp..', 'Angeles Power Inc.',
       'WESM', 'Anda Power Corp.'], dtype=object)

In [202]:
# Correct the duplicate entry in the 'Power Supplier' column
big_df['Power Supplier'] = big_df['Power Supplier'].replace('Anda Power Corp..', 'Anda Power Corp.')

unique_suppliers = big_df['Power Supplier'].unique()
unique_suppliers


array(['GNPower Mariveles', 'Anda Power Corp.', 'Angeles Power Inc.',
       'WESM'], dtype=object)

In [203]:
# Create a mapping of power suppliers to unique IDs
supplier_id_map = {supplier: id+1 for id, supplier in enumerate(unique_suppliers)}

# Create a new DataFrame from the mapping
supplier_df = pd.DataFrame(list(supplier_id_map.items()), columns=['Power Supplier', 'Power Supplier ID'])

supplier_df

Unnamed: 0,Power Supplier,Power Supplier ID
0,GNPower Mariveles,1
1,Anda Power Corp.,2
2,Angeles Power Inc.,3
3,WESM,4


In [204]:
# Create a mapping from Power Suppliers to Supplier IDs
supplier_mapping = dict(zip(supplier_df['Power Supplier'], supplier_df['Power Supplier ID']))

# Replace names with IDs in big_df
big_df['Power Supplier ID'] = big_df['Power Supplier'].map(supplier_mapping)

# Ensure IDs are not converted to floats
#big_df['Power Supplier ID'] = big_df['Power Supplier ID'].astype(int)

# Drop the old Power Suppliers column
big_df = big_df.drop(columns=['Power Supplier'])

big_df.head()

Unnamed: 0,kWh,%,Average Generation Cost,Generation Charge,Date,Power Supplier ID
0,37692892,65.6%,6.139583,5.9543,Oct-2021,1
1,7440000,13.0%,6.583905,5.9543,Oct-2021,2
2,22060,0.04%,793.415525,5.9543,Oct-2021,3
3,12200760,21.2%,4.826696,5.9543,Oct-2021,4
4,33473073,56.6%,10.8863,10.8367,Nov-2022,1


In [205]:
%pip install openpyxl

with pd.ExcelWriter("Historical_AEC_GC_Breakdown.xlsx", engine='openpyxl') as writer:
    big_df.to_excel(writer, sheet_name='Historical GC', index=False)
    supplier_df.to_excel(writer, sheet_name='Supplier IDs', index=False)

Note: you may need to restart the kernel to use updated packages.


#### for troubleshooting of loop

Average Generation Cost is present only from October 2022 onwards.

In [195]:
df = tabula.read_pdf("pdf downloads/March 2023 Generation Charge.pdf", stream = True, pages=1)[0]

In [196]:
df

Unnamed: 0.1,Unnamed: 0,(A),(B),(C),Unnamed: 1,(D),Unnamed: 2,(E = C + D),Unnamed: 3
0,,,,,,,,,Average Gen.
1,,,,Basic Generation Cost,Ot h e,rC ost,,Total Generation Cost,
2,SOURCES,kWh Purchased,% Share,(PhP),Adjustments,1 (PhP),,(PhP),Cost
3,,,,,,,,,(PhP/kWh)
4,1. GNPower Mariveles (Bilateral Contract),36846837,69.5%,"P3 49,351,533.66","( 3,339,921.45)",,,"P346,011,612.21",9.3905
5,2. Anda Power Corp. (Bilateral Contract),11160000,21.0%,98365647.93,,"(379,058.37)",,97986589.56,8.7802
6,3. WESM (Spot Market),4613640,8.7%,"5 8,153,384.96",,"1 ,731,978.79",,59885363.75,12.9801
7,4. Angeles Power Inc. (Bilateral Contract),33150,0.1%,16882473.26,,-,,16882473.26,509.28
8,5. Net Metering Export Energy,375734,0.709%,3679375.91,,-,,3679375.91,9.7925
9,Total Generation Cost,53029361,100.00%,"P526,432,415.72","(1,987,001.03)",,,"P524,445,414.69",


In [197]:
import pandas as pd
import re

# Function to find the column containing 'SOURCES'
def find_column_with_sources(df):
    for col in df.columns:
        if df[col].astype(str).str.contains('SOURCES', case=False, na=False).any():
            return col
    return None

# Function to find columns containing a keyword
def find_columns_with_keywords(df, keywords):
    columns = [col for col in df.columns if df[col].astype(str).str.contains('|'.join(keywords), case=False, na=False).any()]
    return columns

# Function to clean the 'Power Supplier' column
def clean_power_supplier(name):
    cleaned_name = re.sub(r'^\d+\.\s*', '', name)  # Remove leading numbers and period
    cleaned_name = re.sub(r'\s*\(.*\)', '', cleaned_name)  # Remove text in parentheses
    return cleaned_name.strip()

# Function to clean the 'Average Generation Cost' column
def clean_avg_gen_cost(cost):
    return re.sub(r'^P\s*', '', cost).strip()  # Remove leading 'P' and any extra spaces

# Check for the presence of "Average"
contains_average = df.astype(str).apply(lambda x: x.str.contains('PhP/kWh|Php/kWh', case=False, na=False)).any().any()

# Define the initial columns to keep based on keywords
if contains_average:
    # If 'Average' is present, exclude columns with 'Total'
    columns_to_keep = [col for col in df.columns if df[col].astype(str).str.contains('SOURCES|Input|Purchased|Share|PhP/kWh|Php/kWh').any()]
else:
    # If 'Average' is not present, include columns with 'Total'
    columns_to_keep = [col for col in df.columns if df[col].astype(str).str.contains('SOURCES|Input|Purchased|Share|Total').any()]

# Filter the DataFrame to include only the columns to keep
df_new = df[columns_to_keep]

# Find the column with 'SOURCES' in df
col_name_df = find_column_with_sources(df)

if col_name_df:
    # Ensure the identified column values are strings
    df[col_name_df] = df[col_name_df].astype(str)
    
    # Filter rows where the identified column contains any of the keywords
    keywords = ['Contract', 'WESM']
    df_new = df_new[df_new[col_name_df].str.contains('|'.join(keywords), case=False, na=False)]
    
    # Slice df so that indices in df and df_new match
    df_sliced = df.loc[df_new.index]

if contains_average:
    # Rename columns as needed
    df_new.columns = ['Power Supplier', 'kWh', '%', 'Average Generation Cost']
    
    # Find the last column containing values with '.'
    avg_col = df_new.columns[df_new.apply(lambda col: col.astype(str).str.contains('\.', na=False).any())].tolist()[-1]

    # Clean the relevant columns
    df_new['Power Supplier'] = df_new['Power Supplier'].apply(clean_power_supplier)
    df_new['Average Generation Cost'] = df_new[avg_col].astype(str).str.replace(r'[^\d.]', '', regex=True)
    df_new['Average Generation Cost'] = pd.to_numeric(df_new['Average Generation Cost'], errors='coerce')

else:
    df_new.columns = ['Power Supplier', 'kWh', '%', 'Total Generation Cost']
    
    # If "Average" is not present, find the last column with '.' for Total Generation Cost
    last_dot_columns = df.columns[df.apply(lambda col: col.astype(str).str.contains('\.', na=False).any())].tolist()
    
    # Check for columns with '(PhP)'
    total_col = None
    for col in last_dot_columns:
        if df[col].astype(str).str.contains(r'\(PhP\)', case=False, na=False).any():
            total_col = col
            break
    
    # If no column contains '(PhP)', choose the second to the last column with '.'
    if not total_col:
        if len(last_dot_columns) >= 2:
            total_col = last_dot_columns[-2]
        else:
            total_col = last_dot_columns[-1]
    
    # Ensure 'kWh' column is properly named or present
    kwh_col = 'kWh'  # Adjust if needed to match your DataFrame column names
    
    # Check if the kWh column exists
    if kwh_col in df_new.columns:
        # Clean and convert columns
        df_new['Total Generation Cost'] = df_new['Total Generation Cost'].astype(str).str.replace(r'[^\d.]', '', regex=True)
        df_new['Total Generation Cost'] = pd.to_numeric(df_new['Total Generation Cost'], errors='coerce')
        df_new[kwh_col] = df_new[kwh_col].astype(str).str.replace(r'[^\d.]', '', regex=True)
        df_new[kwh_col] = pd.to_numeric(df_new[kwh_col], errors='coerce')
        
        # Compute Average Generation Cost
        df_new['Average Generation Cost'] = df_new['Total Generation Cost'] / df_new[kwh_col]
        
        # Drop Total Generation Cost column
        df_new = df_new.drop(columns=['Total Generation Cost'])
        
        # Clean columns as needed
        df_new.columns = ['Power Supplier', 'kWh', '%', 'Average Generation Cost']
        df_new['Power Supplier'] = df_new['Power Supplier'].apply(clean_power_supplier)

df_new

Unnamed: 0,Power Supplier,kWh,%,Average Generation Cost
4,GNPower Mariveles,36846837,69.5%,9.3905
5,Anda Power Corp.,11160000,21.0%,8.7802
6,WESM,4613640,8.7%,12.9801
7,Angeles Power Inc.,33150,0.1%,509.28


In [198]:
# Get the rows containing 'CHARGE'
charge_rows = df[df[col_name_df].str.contains('for', case=False, na=False)]

if not charge_rows.empty:
    # Select the last row from charge_rows
    charge_row = charge_rows.iloc[-1]
    
    # Find the last non-NaN value in the selected row
    for value in reversed(charge_row[2:]):
        if not pd.isna(value):
            # Clean the value: remove leading 'P', commas, and extra spaces
            rate_value = re.sub(r'^P\s*|[,]', '', value).strip()
            break
    else:
        rate_value = None
else:
    rate_value = None

# Assign the cleaned value to 'Generation Charge' in df_new
df_new['Generation Charge'] = rate_value
df_new

  for value in reversed(charge_row[2:]):


Unnamed: 0,Power Supplier,kWh,%,Average Generation Cost,Generation Charge
4,GNPower Mariveles,36846837,69.5%,9.3905,9.8853
5,Anda Power Corp.,11160000,21.0%,8.7802,9.8853
6,WESM,4613640,8.7%,12.9801,9.8853
7,Angeles Power Inc.,33150,0.1%,509.28,9.8853
