In [3]:
import os
import re
import tabula
import pandas as pd
import numpy as np

In [4]:
# tabula-py shells out to Java, so JAVA_HOME must point at a JDK.
# Respect a JAVA_HOME that is already configured in the environment and only
# fall back to the Homebrew OpenJDK location (macOS, Apple Silicon) when none is set.
# NOTE(review): the fallback path is machine-specific — adjust it on other setups.
if not os.environ.get("JAVA_HOME"):
    os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk/libexec/openjdk.jdk"

In [5]:
# Lift pandas' truncation limits so displayed frames show every row and every column
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [6]:
# Install the third-party packages into the kernel's environment (%pip targets the
# running kernel): tabula-py for PDF table extraction, requests + beautifulsoup4 for scraping.
# NOTE(review): openpyxl is not used in the cells shown here — presumably needed for
# Excel output later; confirm. Versions are unpinned; pin them for reproducible runs.
# Restart the kernel after this cell if a package was newly installed or upgraded.
%pip install -q tabula-py
%pip install requests beautifulsoup4
%pip install openpyxl

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### **Importing**

Download the HTML source of AEC's generation-charge webpage and save it locally.

In [1]:
import requests
from bs4 import BeautifulSoup

# URL to be scraped
url = "https://angeleselectric.com.ph/generation-charge/"

# Make a GET request to fetch the raw HTML content.
# requests applies no timeout by default, so an unresponsive server would hang
# this cell forever; cap the wait explicitly.
response = requests.get(url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Save the (prettified) HTML to a file so later cells can work offline
    with open('generation_charge.html', 'w', encoding='utf-8') as file:
        file.write(soup.prettify())

    print("HTML content has been saved successfully.")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

HTML content has been saved successfully.


In [16]:
# Input page saved by the scraping step, and the folder that will hold the PDFs
html_file_path = 'generation_charge.html'
save_dir = 'pdf downloads'

# Make sure the download folder exists before anything is written to it
os.makedirs(save_dir, exist_ok=True)

# Read the saved page back from disk and parse it
with open(html_file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
soup = BeautifulSoup(html_content, 'html.parser')

# Collect every <a> tag whose text contains "PDF" (case-insensitive)
pdf_text_pattern = re.compile(r'PDF', re.IGNORECASE)
links = soup.find_all('a', string=pdf_text_pattern)

# Warn when the page yielded nothing to download
if len(links) == 0:
    print("No links found with the text containing 'PDF'.")

# Function to sanitize file names
def sanitize_filename(filename):
    """Return ``filename`` reduced to characters that are safe in a file name.

    Alphanumeric characters, spaces, underscores and hyphens are kept; every
    other character is dropped. Trailing whitespace is removed from the result
    (leading whitespace is left as is).
    """
    allowed_punctuation = (' ', '_', '-')
    kept_chars = []
    for ch in filename:
        if ch.isalnum() or ch in allowed_punctuation:
            kept_chars.append(ch)
    return "".join(kept_chars).rstrip()

# Download each PDF
for link in links:
    # An <a> tag is not guaranteed to carry an href; skip such anchors
    # instead of aborting the whole loop with a KeyError.
    pdf_url = link.get('href')
    if not pdf_url:
        print(f'Skipped link without href: {link.get_text(strip=True)}')
        continue

    # Name the file after the closest preceding <strong> tag;
    # fall back to the link's own text when there is none.
    # NOTE(review): two links that share the same <strong> tag get the same
    # file name, so the later download overwrites the earlier one.
    strong_tag = link.find_previous('strong')
    if strong_tag:
        file_name = strong_tag.get_text(strip=True)
    else:
        file_name = link.text.strip().replace(' ', '_')

    sanitized_file_name = sanitize_filename(file_name) + '.pdf'
    file_path = os.path.join(save_dir, sanitized_file_name)

    # Download the PDF file. requests has no default timeout, and a single
    # connection error should not stop the remaining downloads.
    try:
        response = requests.get(pdf_url, timeout=60)
    except requests.RequestException as exc:
        print(f'Failed to download: {pdf_url} ({exc})')
        continue

    if response.status_code == 200:
        with open(file_path, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f'Downloaded: {file_path}')
    else:
        print(f'Failed to download: {pdf_url}')

print('Download completed.')


Downloaded: pdf downloads/July 2024 Generation Charge.pdf
Downloaded: pdf downloads/June 2024 Generation Charge.pdf
Downloaded: pdf downloads/May 2024 Generation Charge.pdf
Downloaded: pdf downloads/April 2024 Generation Charge.pdf
Downloaded: pdf downloads/March 2024 Generation Charge.pdf
Downloaded: pdf downloads/February 2024 Generation Charge.pdf
Downloaded: pdf downloads/January 2024 Generation Charge.pdf
Downloaded: pdf downloads/December 2023 Generation Charge.pdf
Downloaded: pdf downloads/November 2023 Generation Charge.pdf
Downloaded: pdf downloads/October 2023 Generation Charge.pdf
Downloaded: pdf downloads/September 2023 Generation Charge.pdf
Downloaded: pdf downloads/August 2023 Generation Charge.pdf
Downloaded: pdf downloads/July 2023 Generation Charge.pdf
Downloaded: pdf downloads/June 2023 Generation Charge.pdf
Downloaded: pdf downloads/May 2023 Generation Charge.pdf
Downloaded: pdf downloads/April 2023 Generation Charge.pdf
Downloaded: pdf downloads/March 2023 Generatio

### **Processing**

In [21]:
# Extract the tables on page 1 of the February 2024 PDF in lattice mode
# and keep the first table returned.
february_2024_tables = tabula.read_pdf(
    "pdf downloads/February 2024 Generation Charge.pdf",
    pages=1,
    lattice=True,
)
df = february_2024_tables[0]

In [22]:
# Display the raw extracted table to inspect its layout before cleaning
df

Unnamed: 0.1,Unnamed: 0,(A),(B),Unnamed: 1,(C),(D),(E = C + D),(F = E/A)
0,SOURCES,Net kWh Input,% Share,,Basic Generation Cost\r(PhP),Other Cost\rA d j u s t m ents\r(Discou...,Total Generation Cost\r(PhP),Average\rG e n e r a t i o n\rCost\r(P...
1,1. GNPower Mariveles (Bilateral Contract),31371946,51.01%,,"P 200,521,700.90","( 4,108,949.55)","P 196,412,751.36",P 6.2608
2,2. Anda Power Corp. (Bilateral Contract),10765000,17.50%,,82013093.93,"(368,934.34)",81644159.59,7.5842
3,4. WESM (Spot Market),18663790,30.35%,,86779908.43,264405.19,87044313.62,4.6638
4,3. Angeles Power Inc. (Bilateral Contract),114862,0.19%,,17892134.25,-,17892134.25,155.77
5,5. Net Metering (Export),589141,0.96%,,3759073.06,-,3759073.06,6.3806
6,Others:,,,,,,-,
7,Pilferage Recoveries,,,,,,"( 585,399.75)",( 0.0010)
8,,,,,,,,
9,TOTAL,61504739,100.00%,,"P 390,965,910.57","(4,213,478.70)","P 386,167,032.13",P 6.2787


**Notes**
- Include the columns containing SOURCES and kWh
- If there are already three columns, it means that the computed column is included
- Computed column is included only from Oct 2022 onwards
- keywords for slicing: Contract, WESM, TOTAL