In [3]:
import os
import re
import tabula
import pandas as pd
import numpy as np

In [4]:
# Make a Java installation discoverable through JAVA_HOME.
# The Homebrew path below is machine-specific, so it is only a fallback:
# a JAVA_HOME already present in the environment is left untouched.
os.environ.setdefault("JAVA_HOME", "/opt/homebrew/opt/openjdk/libexec/openjdk.jdk")

In [5]:
# Lift pandas' display truncation so frames render with every row and column
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [6]:
# Install the third-party packages used by this notebook into the running
# kernel's environment (%pip targets the kernel, unlike a bare shell pip).
# NOTE(review): versions are unpinned, and this cell's execution count is later
# than the import cell's — confirm the packages exist before `import tabula`
# when running on a fresh kernel.
%pip install -q tabula-py
%pip install requests beautifulsoup4
%pip install openpyxl

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### **Importing**

Download the HTML source of AEC's generation charge webpage.

In [1]:
import requests
from bs4 import BeautifulSoup

# URL to be scraped
url = "https://angeleselectric.com.ph/generation-charge/"

# Make a GET request to fetch the page. requests applies no timeout by default,
# so pass one (in seconds) to keep a stalled connection from hanging the kernel.
response = requests.get(url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Save the parsed HTML to a file. What is written is the prettified
    # (re-indented) markup, not the byte-for-byte server response.
    with open('generation_charge.html', 'w', encoding='utf-8') as file:
        file.write(soup.prettify())

    print("HTML content has been saved successfully.")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

HTML content has been saved successfully.


In [16]:
# Locations: the saved page to read and the folder that receives the PDFs
html_file_path = 'generation_charge.html'
save_dir = 'pdf downloads'

# Create the download folder up front (no error if it already exists)
os.makedirs(save_dir, exist_ok=True)

# Read the saved page back into memory
with open(html_file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

# Build the parse tree, then collect every <a> tag whose text contains "PDF"
# in any letter case
soup = BeautifulSoup(html_content, 'html.parser')
pdf_text_pattern = re.compile(r'PDF', re.IGNORECASE)
links = soup.find_all('a', string=pdf_text_pattern)

# Report when the page yields nothing to download
if len(links) == 0:
    print("No links found with the text containing 'PDF'.")
# Function to sanitize file names
def sanitize_filename(filename):
    """Return `filename` reduced to characters that are safe in a file name.

    Alphanumeric characters, spaces, underscores and hyphens are kept; every
    other character is dropped. Trailing whitespace is removed from the result
    (leading whitespace is preserved).
    """
    allowed_symbols = (' ', '_', '-')
    kept_chars = [ch for ch in filename if ch.isalnum() or ch in allowed_symbols]
    return "".join(kept_chars).rstrip()

# Download each PDF
for link in links:
    # An anchor without an href has nothing to download; skip it rather than
    # letting a KeyError abort the whole loop.
    pdf_url = link.get('href')
    if not pdf_url:
        print(f'Skipped link without href: {link.get_text(strip=True)}')
        continue

    # Name the file after the closest preceding <strong> tag; fall back to the
    # link's own text when there is none.
    strong_tag = link.find_previous('strong')
    if strong_tag:
        file_name = strong_tag.get_text(strip=True)
    else:
        file_name = link.text.strip().replace(' ', '_')

    sanitized_file_name = sanitize_filename(file_name) + '.pdf'
    file_path = os.path.join(save_dir, sanitized_file_name)

    # Download the PDF file. The timeout (seconds) keeps one stalled request
    # from hanging the kernel, and a network error on one file is reported
    # without stopping the remaining downloads.
    try:
        response = requests.get(pdf_url, timeout=60)
    except requests.RequestException as error:
        print(f'Failed to download: {pdf_url} ({error})')
        continue

    if response.status_code == 200:
        with open(file_path, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f'Downloaded: {file_path}')
    else:
        print(f'Failed to download: {pdf_url}')

print('Download completed.')


Downloaded: pdf downloads/July 2024 Generation Charge.pdf
Downloaded: pdf downloads/June 2024 Generation Charge.pdf
Downloaded: pdf downloads/May 2024 Generation Charge.pdf
Downloaded: pdf downloads/April 2024 Generation Charge.pdf
Downloaded: pdf downloads/March 2024 Generation Charge.pdf
Downloaded: pdf downloads/February 2024 Generation Charge.pdf
Downloaded: pdf downloads/January 2024 Generation Charge.pdf
Downloaded: pdf downloads/December 2023 Generation Charge.pdf
Downloaded: pdf downloads/November 2023 Generation Charge.pdf
Downloaded: pdf downloads/October 2023 Generation Charge.pdf
Downloaded: pdf downloads/September 2023 Generation Charge.pdf
Downloaded: pdf downloads/August 2023 Generation Charge.pdf
Downloaded: pdf downloads/July 2023 Generation Charge.pdf
Downloaded: pdf downloads/June 2023 Generation Charge.pdf
Downloaded: pdf downloads/May 2023 Generation Charge.pdf
Downloaded: pdf downloads/April 2023 Generation Charge.pdf
Downloaded: pdf downloads/March 2023 Generatio

### **Processing**

In [43]:
# Extract the tables on page 1 of the September 2022 PDF using tabula's
# lattice mode, then keep the first table returned.
pdf_tables = tabula.read_pdf(
    "pdf downloads/September 2022 Generation Charge.pdf",
    pages=1,
    lattice=True,
)
df = pdf_tables[0]

In [44]:
# Display the extracted table as-is to inspect its layout
df

Unnamed: 0.1,Unnamed: 0,(A),(B),Unnamed: 1,(C),(D),(E = C + D)
0,SOURCES,Kwh Purchased,% Share,,Basic Generation Cost\r(PhP),O t h e r Cost\r1Adjustments\r(PhP),Total Generation Cost\r(PhP)
1,1. GNPower Mariveles (Bilateral Contract),38552993,66.6%,,"P 381,648,700.68",-,"P 381,648,700.68"
2,,,,,,,
3,2. Anda Power Corp. (Bilateral Contract),11160000,19.3%,,89519843.52,,89519843.52
4,,,,,,,
5,3. Angeles Power Inc. (Bilateral Contract),224337,0.39%,,18954395.51,-,18954395.51
6,,,,,,,
7,4. WESM (Spot Market),7776090,13.4%,,67897693.92,1446724.43,69344418.35
8,,,,,,,
9,5. Net Metering (Others),167760,0.290%,,1605228.32,-,1605228.32


In [41]:
# Filtering columns: keep only the columns in which at least one cell
# mentions 'SOURCES' or 'kWh'.
# NOTE(review): this match is case-sensitive, so a header spelled "Kwh" is not
# kept — confirm that is intended.
columns_to_keep = [col for col in df.columns if df[col].astype(str).str.contains('SOURCES|kWh').any()]
df_new = df[columns_to_keep]

# Define the list of keywords to filter by
keywords = ['Contract', 'TOTAL', 'WESM']

# Filter rows where the first column contains any of the keywords
# (case-insensitive; missing values count as no match). Take an explicit copy
# so later column assignments modify df_new itself instead of a slice of df,
# which would trigger pandas' SettingWithCopyWarning.
df_new = df_new[df_new.iloc[:, 0].str.contains('|'.join(keywords), case=False, na=False)].copy()

# Only a three-column result is given its final column names.
if len(df_new.columns) == 3:
    df_new.columns = ['Power Supplier','kWh','Average Generation Cost']
# TODO: handle the two-column case — compute the missing column after
# cleaning columns C and E.

In [42]:
def clean_power_supplier(name):
    """Return a supplier label without its list numbering or parenthetical.

    A leading "<digits>." prefix (and the whitespace after it) is removed, then
    the span from the first "(" to the last ")" together with any whitespace
    before it, and finally the surrounding whitespace.
    """
    return re.sub(
        r'\s*\(.*\)',                    # parenthesised text
        '',
        re.sub(r'^\d+\.\s*', '', name),  # leading numbers and period
    ).strip()

def clean_avg_gen_cost(cost):
    """Strip the leading peso marker "P" from a cost cell.

    Parameters
    ----------
    cost : str or any
        Cell value such as "P 381,648,700.68". Non-string values (for example
        NaN or an already-numeric cell) are returned unchanged instead of
        raising TypeError.

    Returns
    -------
    The string without its leading "P" and without surrounding whitespace, or
    the original value when it is not a string.
    """
    if not isinstance(cost, str):
        return cost
    # Allow whitespace before the "P" so values like " P 6.26" are cleaned too.
    return re.sub(r'^\s*P\s*', '', cost).strip()

# Run each column through its matching cleaner, one value at a time
for column, cleaner in (
    ('Power Supplier', clean_power_supplier),
    ('Average Generation Cost', clean_avg_gen_cost),
):
    df_new[column] = df_new[column].apply(cleaner)

df_new

Unnamed: 0,Power Supplier,kWh,Average Generation Cost
1,GNPower Mariveles,31371946,6.2608
2,Anda Power Corp.,10765000,7.5842
3,WESM,18663790,4.6638
4,Angeles Power Inc.,114862,155.77
9,TOTAL,61504739,6.2787


**Notes**
- Computed column is included only from Oct 2022 onwards.