In [1]:
import os
import re
import tabula
import pandas as pd
import numpy as np

In [2]:
# Set the JAVA_HOME environment variable to the Java installation directory
os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk/libexec/openjdk.jdk"

In [3]:
# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [4]:
%pip install -q tabula-py
%pip install requests beautifulsoup4
%pip install openpyxl

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### **Importing**

Download the source code for AEC's webpage.

In [5]:
import requests
from bs4 import BeautifulSoup

# URL to be scraped
url = "https://angeleselectric.com.ph/generation-charge/"

# Make a GET request to fetch the raw HTML content
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Save the raw HTML to a file
    with open('generation_charge.html', 'w', encoding='utf-8') as file:
        file.write(soup.prettify())
    
    print("HTML content has been saved successfully.")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

HTML content has been saved successfully.


In [16]:
# Path to the HTML file
html_file_path = 'generation_charge.html'

# Directory to save downloaded PDFs
save_dir = 'pdf downloads'
os.makedirs(save_dir, exist_ok=True)

# Load the HTML content
with open(html_file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

# Parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# Find all <a> tags with text containing "PDF"
links = soup.find_all('a', string=re.compile(r'PDF', re.IGNORECASE))

# Check if any links are found
if not links:
    print("No links found with the text containing 'PDF'.")

# Function to sanitize file names
def sanitize_filename(filename):
    return "".join(c for c in filename if c.isalnum() or c in (' ', '_', '-')).rstrip()

# Download each PDF
for link in links:
    pdf_url = link['href']
    
    # Find the closest preceding <strong> tag
    strong_tag = link.find_previous('strong')
    if strong_tag:
        file_name = strong_tag.get_text(strip=True)
    else:
        file_name = link.text.strip().replace(' ', '_')
    
    sanitized_file_name = sanitize_filename(file_name) + '.pdf'
    file_path = os.path.join(save_dir, sanitized_file_name)
    
    # Download the PDF file
    response = requests.get(pdf_url)
    if response.status_code == 200:
        with open(file_path, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f'Downloaded: {file_path}')
    else:
        print(f'Failed to download: {pdf_url}')

print('Download completed.')


Downloaded: pdf downloads/July 2024 Generation Charge.pdf
Downloaded: pdf downloads/June 2024 Generation Charge.pdf
Downloaded: pdf downloads/May 2024 Generation Charge.pdf
Downloaded: pdf downloads/April 2024 Generation Charge.pdf
Downloaded: pdf downloads/March 2024 Generation Charge.pdf
Downloaded: pdf downloads/February 2024 Generation Charge.pdf
Downloaded: pdf downloads/January 2024 Generation Charge.pdf
Downloaded: pdf downloads/December 2023 Generation Charge.pdf
Downloaded: pdf downloads/November 2023 Generation Charge.pdf
Downloaded: pdf downloads/October 2023 Generation Charge.pdf
Downloaded: pdf downloads/September 2023 Generation Charge.pdf
Downloaded: pdf downloads/August 2023 Generation Charge.pdf
Downloaded: pdf downloads/July 2023 Generation Charge.pdf
Downloaded: pdf downloads/June 2023 Generation Charge.pdf
Downloaded: pdf downloads/May 2023 Generation Charge.pdf
Downloaded: pdf downloads/April 2023 Generation Charge.pdf
Downloaded: pdf downloads/March 2023 Generatio

### **Processing**

In [58]:
import pandas as pd
import tabula
import re
import numpy as np
import os

# Function to process each PDF file
def process_pdf(file_path):
    df = tabula.read_pdf(file_path, stream=True, pages=1)[0]
    columns_to_keep = [col for col in df.columns if df[col].astype(str).str.contains('SOURCES|Kwh|Input|Purchased|Share|Average').any()]
    df_new = df[columns_to_keep]

    keywords = ['Contract', 'WESM']

    def find_column_with_sources(df):
        for col in df.columns:
            if df[col].astype(str).str.contains('SOURCES', case=False, na=False).any():
                return col
        return None

    col_name_df = find_column_with_sources(df)

    if col_name_df:
        df[col_name_df] = df[col_name_df].astype(str)
        df_new = df_new[df_new[col_name_df].str.contains('|'.join(keywords), case=False, na=False)]
        df_sliced = df.loc[df_new.index]

    def clean_power_supplier(name):
        cleaned_name = re.sub(r'^\d+\.\s*', '', name)
        cleaned_name = re.sub(r'\s*\(.*\)', '', cleaned_name)
        return cleaned_name.strip()

    def clean_avg_gen_cost(cost):
        return re.sub(r'^P\s*', '', cost).strip()

    if len(df_new.columns) == 4:
        df_new.columns = ['Power Supplier','kWh','%','Average Generation Cost']
        df_new['Power Supplier'] = df_new['Power Supplier'].apply(clean_power_supplier)
        df_new['Average Generation Cost'] = df_new['Average Generation Cost'].apply(clean_avg_gen_cost)

    elif len(df_new.columns) == 2:
        col_name = next(col for col in df.columns if 'E' in col)
        df_sliced[col_name] = df_sliced[col_name].str.lstrip('P').str.replace(',', '').str.strip()
        df_sliced[col_name] = df_sliced[col_name].replace('-', np.nan)
        df_sliced[col_name] = pd.to_numeric(df_sliced[col_name], errors='coerce')
        df_new['(A)'] = df_new['(A)'].str.replace(r'[,\s]', '', regex=True)
        df_new['(A)'] = df_new['(A)'].replace('-', np.nan)
        df_new['(A)'] = pd.to_numeric(df_new['(A)'], errors='coerce')
        df_new['Average Generation Cost'] = df_sliced[col_name] / df_new['(A)']
        df_new.columns = ['Power Supplier','kWh','Average Generation Cost']
        df_new['Power Supplier'] = df_new['Power Supplier'].apply(clean_power_supplier)

    charge_rows = df[df[col_name_df].str.contains('for', case=False, na=False)]

    if not charge_rows.empty:
        charge_row = charge_rows.iloc[-1]
        for value in reversed(charge_row[2:]):
            if not pd.isna(value):
                rate_value = re.sub(r'^P\s*|[,]', '', value).strip()
                break
        else:
            rate_value = None
    else:
        rate_value = None

    df_new['Generation Charge'] = rate_value
    return df_new

# Initialize an empty DataFrame for combining results
big_df = pd.DataFrame()

# Directory containing the PDF files
folder_path = "pdf downloads"

# Loop through all PDF files in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith('.pdf'):
        file_path = os.path.join(folder_path, file_name)
        df_new = process_pdf(file_path)
        
        # Extract the date from the file name and format it
        date_str = re.search(r'(\w+)\s(\d{4})', file_name)
        if date_str:
            month_name, year = date_str.groups()
            month_abbr = pd.to_datetime(month_name, format='%B').strftime('%b')
            formatted_date = f"{month_abbr}-{year}"
        else:
            formatted_date = None
        
        df_new['Date'] = formatted_date
        
        # Append the processed data to big_df
        big_df = pd.concat([big_df, df_new], ignore_index=True)

# Reset index and display the combined DataFrame
big_df.reset_index(drop=True, inplace=True)

  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in reversed(charge_row[2:]):
  for value in r

In [25]:
big_df.head()

Unnamed: 0.1,Unnamed: 0,(A),(B),Generation Charge,Date,Power Supplier,kWh,%,Average Generation Cost
0,1. GNPower Mariveles (Bilateral Contract),37692892.0,65.6%,5.9543,Oct-2021,,,,
1,2. Anda Power Corp.. (Bilateral Contract),7440000.0,13.0%,5.9543,Oct-2021,,,,
2,3. Angeles Power Inc. (Bilateral Contract),22060.0,0.04%,5.9543,Oct-2021,,,,
3,4. WESM (Spot Market),12200760.0,21.2%,5.9543,Oct-2021,,,,
4,,,,10.8367,Nov-2022,GNPower Mariveles,33473073.0,56.6%,10.8863


#### Creating Supplier Dataframe

In [331]:
unique_suppliers = big_df['Power Supplier'].unique()

unique_suppliers

array(['GNPower Mariveles', 'Angeles Power Inc.', 'WESM',
       'Anda Power Corp.', 'Anda Power Corp..'], dtype=object)

In [332]:
# Correct the duplicate entry in the 'Power Supplier' column
big_df['Power Supplier'] = big_df['Power Supplier'].replace('Anda Power Corp..', 'Anda Power Corp.')

unique_suppliers = big_df['Power Supplier'].unique()
unique_suppliers


array(['GNPower Mariveles', 'Angeles Power Inc.', 'WESM',
       'Anda Power Corp.'], dtype=object)

In [334]:
# Create a mapping of power suppliers to unique IDs
supplier_id_map = {supplier: id+1 for id, supplier in enumerate(unique_suppliers)}

# Create a new DataFrame from the mapping
supplier_df = pd.DataFrame(list(supplier_id_map.items()), columns=['Power Supplier', 'Power Supplier ID'])

supplier_df

Unnamed: 0,Power Supplier,Power Supplier ID
0,GNPower Mariveles,1
1,Angeles Power Inc.,2
2,WESM,3
3,Anda Power Corp.,4


In [336]:
# Create a mapping from Power Suppliers to Supplier IDs
supplier_mapping = dict(zip(supplier_df['Power Supplier'], supplier_df['Power Supplier ID']))

# Replace names with IDs in big_df
big_df['Power Supplier ID'] = big_df['Power Supplier'].map(supplier_mapping)

# Ensure IDs are not converted to floats
#big_df['Power Supplier ID'] = big_df['Power Supplier ID'].astype(int)

# Drop the old Power Suppliers column
big_df = big_df.drop(columns=['Power Supplier'])

big_df.head()

Unnamed: 0,kWh,Average Generation Cost,Generation Charge,Date,Power Supplier ID
0,29061060.0,4.684679,5.6304,Jan-2020,1
1,2928356.0,13.676752,5.6304,Jan-2020,2
2,13726510.0,6.10888,5.6304,Jan-2020,3
3,7200000.0,5.455071,5.6304,Jan-2020,4
4,7440000.0,5.413379,4.9722,Feb-2020,4


In [337]:
%pip install openpyxl

with pd.ExcelWriter("Historical_AEC_GC_Breakdown.xlsx", engine='openpyxl') as writer:
    big_df.to_excel(writer, sheet_name='Historical GC', index=False)
    supplier_df.to_excel(writer, sheet_name='Supplier IDs', index=False)

Note: you may need to restart the kernel to use updated packages.


#### for troubleshooting of loop

Average Generation Cost is present only from October 2022 onwards.

In [147]:
df = tabula.read_pdf("pdf downloads/June 2024 Generation Charge.pdf", stream = True, pages=1)[0]

In [148]:
df

Unnamed: 0.1,Unnamed: 0,(A),(B),(C),(D),(E = C + D),(F = E/A)
0,,,,,Other Cost,,Average
1,,,,Basic Generation Cost,Ad j u s t m ents,Total Generation Cost,Ge n e r a t i o n
2,SOURCES,Net kWh Input,% Share,,,,
3,,,,(PhP),(Discounts & Other,(PhP),Cost
4,,,,,Adj.)(PhP),,(Php/kWh)
5,1. GNPower Mariveles (Bilateral Contract),40872989,47.28%,"P2 64,947,931.49","( 11,977,467.57)","P252,970,463.92",P6.1892
6,2. Anda Power Corp. (Bilateral Contract),10800000,12.49%,78683444.91,"(405,690.96)","7 8,277,753.95",7.2479
7,4. WESM (Spot Market),33991529,39.32%,77932195.99,"(159,208.02)","7 7,772,987.97",2.2880
8,3. Angeles Power Inc. (Bilateral Contract),-,0.00%,-,-,-,-
9,5. Net Metering (Export),787330,0.91%,5528080.30,-,5528080.30,7.0213


In [160]:
import pandas as pd
import re

# Function to find the column containing 'SOURCES'
def find_column_with_sources(df):
    for col in df.columns:
        if df[col].astype(str).str.contains('SOURCES', case=False, na=False).any():
            return col
    return None

# Function to find columns containing a keyword
def find_columns_with_keywords(df, keywords):
    columns = [col for col in df.columns if df[col].astype(str).str.contains('|'.join(keywords), case=False, na=False).any()]
    return columns

# Function to clean the 'Power Supplier' column
def clean_power_supplier(name):
    cleaned_name = re.sub(r'^\d+\.\s*', '', name)  # Remove leading numbers and period
    cleaned_name = re.sub(r'\s*\(.*\)', '', cleaned_name)  # Remove text in parentheses
    return cleaned_name.strip()

# Function to clean the 'Average Generation Cost' column
def clean_avg_gen_cost(cost):
    return re.sub(r'^P\s*', '', cost).strip()  # Remove leading 'P' and any extra spaces

# Check for the presence of "Average"
contains_average = df.astype(str).apply(lambda x: x.str.contains('PhP/kWh', case=False, na=False)).any().any()

# Define the initial columns to keep based on keywords
if contains_average:
    # If 'Average' is present, exclude columns with 'Total'
    columns_to_keep = [col for col in df.columns if df[col].astype(str).str.contains('SOURCES|Input|Purchased|Share|PhP/kWh|Php/kWh').any()]
else:
    # If 'Average' is not present, include columns with 'Total'
    columns_to_keep = [col for col in df.columns if df[col].astype(str).str.contains('SOURCES|Input|Purchased|Share|Total').any()]

# Filter the DataFrame to include only the columns to keep
df_new = df[columns_to_keep]

# Find the column with 'SOURCES' in df
col_name_df = find_column_with_sources(df)

if col_name_df:
    # Ensure the identified column values are strings
    df[col_name_df] = df[col_name_df].astype(str)
    
    # Filter rows where the identified column contains any of the keywords
    keywords = ['Contract', 'WESM']
    df_new = df_new[df_new[col_name_df].str.contains('|'.join(keywords), case=False, na=False)]
    
    # Slice df so that indices in df and df_new match
    df_sliced = df.loc[df_new.index]

if contains_average:
    # Rename columns as needed
    df_new.columns = ['Power Supplier', 'kWh', '%', 'Average Generation Cost']
    
    # Find the last column containing values with '.'
    avg_col = df_new.columns[df_new.apply(lambda col: col.astype(str).str.contains('\.', na=False).any())].tolist()[-1]

    # Clean the relevant columns
    df_new['Power Supplier'] = df_new['Power Supplier'].apply(clean_power_supplier)
    df_new['Average Generation Cost'] = df_new[avg_col].astype(str).str.replace(r'[^\d.]', '', regex=True)
    df_new['Average Generation Cost'] = pd.to_numeric(df_new['Average Generation Cost'], errors='coerce')

        
elif not contains_average:
    df_new.columns = ['Power Supplier', 'kWh', '%', 'Total Generation Cost']
    
    # If "Average" is not present, find the last column with '.' for Total Generation Cost
    total_col = df_new.columns[df_new.apply(lambda col: col.astype(str).str.contains('\.', na=False).any())].tolist()[-1]

    # Ensure 'kWh' column is properly named or present
    kwh_col = 'kWh'  # Adjust if needed to match your DataFrame column names
    
    # Check if the kWh column exists
    if kwh_col in df_new.columns:
        # Clean and convert columns
        df_new[total_col] = df_new[total_col].astype(str).str.replace(r'[^\d.]', '', regex=True)
        df_new[total_col] = pd.to_numeric(df_new[total_col], errors='coerce')
        df_new[kwh_col] = df_new[kwh_col].astype(str).str.replace(r'[^\d.]', '', regex=True)
        df_new[kwh_col] = pd.to_numeric(df_new[total_col], errors='coerce')
        
        # Compute Average Generation Cost
        df_new['Average Generation Cost'] = df_new[total_col] / df_new[kwh_col]
        
        # Drop Total Generation Cost column
        df_new = df_new.drop(columns=[total_col])
        
        # Clean columns as needed
        df_new.columns = ['Power Supplier', 'kWh', '%', 'Average Generation Cost']
        df_new['Power Supplier'] = df_new['Power Supplier'].apply(clean_power_supplier)

df_new

True


Unnamed: 0,Power Supplier,kWh,%,Average Generation Cost
5,GNPower Mariveles,40872989,47.28%,6.1892
6,Anda Power Corp.,10800000,12.49%,7.2479
7,WESM,33991529,39.32%,2.288
8,Angeles Power Inc.,-,0.00%,


In [146]:
# Get the rows containing 'CHARGE'
charge_rows = df[df[col_name_df].str.contains('for', case=False, na=False)]

if not charge_rows.empty:
    # Select the last row from charge_rows
    charge_row = charge_rows.iloc[-1]
    
    # Find the last non-NaN value in the selected row
    for value in reversed(charge_row[2:]):
        if not pd.isna(value):
            # Clean the value: remove leading 'P', commas, and extra spaces
            rate_value = re.sub(r'^P\s*|[,]', '', value).strip()
            break
    else:
        rate_value = None
else:
    rate_value = None

# Assign the cleaned value to 'Generation Charge' in df_new
df_new['Generation Charge'] = rate_value
df_new

  for value in reversed(charge_row[2:]):


Unnamed: 0,Power Supplier,kWh,%,Average Generation Cost,Generation Charge
5,GNPower Mariveles,252970500.0,47.28%,1.0,4.7925
6,Anda Power Corp.,78277750.0,12.49%,1.0,4.7925
7,WESM,77772990.0,39.32%,1.0,4.7925
8,Angeles Power Inc.,,0.00%,,4.7925
