In [3]:
import os
import re
import tabula
import pandas as pd
import numpy as np

In [4]:
# Set the JAVA_HOME environment variable to the Java installation directory
os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk/libexec/openjdk.jdk"

In [5]:
# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [6]:
%pip install -q tabula-py
%pip install requests beautifulsoup4
%pip install openpyxl

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### **Importing**

Download the source code for AEC's webpage.

In [1]:
import requests
from bs4 import BeautifulSoup

# URL to be scraped
url = "https://angeleselectric.com.ph/generation-charge/"

# Make a GET request to fetch the raw HTML content
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Save the raw HTML to a file
    with open('generation_charge.html', 'w', encoding='utf-8') as file:
        file.write(soup.prettify())
    
    print("HTML content has been saved successfully.")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

HTML content has been saved successfully.


In [16]:
# Path to the HTML file
html_file_path = 'generation_charge.html'

# Directory to save downloaded PDFs
save_dir = 'pdf downloads'
os.makedirs(save_dir, exist_ok=True)

# Load the HTML content
with open(html_file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

# Parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# Find all <a> tags with text containing "PDF"
links = soup.find_all('a', string=re.compile(r'PDF', re.IGNORECASE))

# Check if any links are found
if not links:
    print("No links found with the text containing 'PDF'.")

# Function to sanitize file names
def sanitize_filename(filename):
    return "".join(c for c in filename if c.isalnum() or c in (' ', '_', '-')).rstrip()

# Download each PDF
for link in links:
    pdf_url = link['href']
    
    # Find the closest preceding <strong> tag
    strong_tag = link.find_previous('strong')
    if strong_tag:
        file_name = strong_tag.get_text(strip=True)
    else:
        file_name = link.text.strip().replace(' ', '_')
    
    sanitized_file_name = sanitize_filename(file_name) + '.pdf'
    file_path = os.path.join(save_dir, sanitized_file_name)
    
    # Download the PDF file
    response = requests.get(pdf_url)
    if response.status_code == 200:
        with open(file_path, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f'Downloaded: {file_path}')
    else:
        print(f'Failed to download: {pdf_url}')

print('Download completed.')


Downloaded: pdf downloads/July 2024 Generation Charge.pdf
Downloaded: pdf downloads/June 2024 Generation Charge.pdf
Downloaded: pdf downloads/May 2024 Generation Charge.pdf
Downloaded: pdf downloads/April 2024 Generation Charge.pdf
Downloaded: pdf downloads/March 2024 Generation Charge.pdf
Downloaded: pdf downloads/February 2024 Generation Charge.pdf
Downloaded: pdf downloads/January 2024 Generation Charge.pdf
Downloaded: pdf downloads/December 2023 Generation Charge.pdf
Downloaded: pdf downloads/November 2023 Generation Charge.pdf
Downloaded: pdf downloads/October 2023 Generation Charge.pdf
Downloaded: pdf downloads/September 2023 Generation Charge.pdf
Downloaded: pdf downloads/August 2023 Generation Charge.pdf
Downloaded: pdf downloads/July 2023 Generation Charge.pdf
Downloaded: pdf downloads/June 2023 Generation Charge.pdf
Downloaded: pdf downloads/May 2023 Generation Charge.pdf
Downloaded: pdf downloads/April 2023 Generation Charge.pdf
Downloaded: pdf downloads/March 2023 Generatio

### **Processing**

In [166]:
df = tabula.read_pdf("pdf downloads/April 2021 Generation Charge.pdf", lattice = True, pages=1)[0]

In [167]:
df

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,(A),(B),Unnamed: 2,(C),(D),(E = C + D),Unnamed: 3
0,,SOURCES,Kwh Purchased,% Share,,Basic Generation Cost\r(PhP),Other Cost\r\r1 Adjustments\r(PhP),Total Generation\rCost\r(PhP),
1,,1. GNPower Mariveles (Bilateral Contract),17956233,36.0%,,"P 83,062,474.18",-,"P 83,062,474.18",
2,,,,,,,,,
3,,2. Angeles Power Inc. (Bilateral Contract),21026,0.0%,,17402383.56,,17402383.56,
4,,,,,,,,,
5,,3. Anda Power Corp. (Bilateral Contract),6720000,13.5%,,38002727.75,-,38002727.75,
6,,,,,,,,,
7,,4. WESM (Spot Market),25092580,50.4%,,112775178.41,963451.95,113738630.36,
8,,,,,,,,,
9,,5. Net Metering (Others),42553,0.085%,,194075.67,-,194075.67,


In [168]:
#Filtering columns
columns_to_keep = [col for col in df.columns if df[col].astype(str).str.contains('SOURCES|Kwh|Input').any()]
df_new = df[columns_to_keep]

# Define the list of keywords to filter by 
# Add 'TOTAL' if needed
keywords = ['Contract', 'WESM']

# Function to find the column containing 'SOURCES'
def find_column_with_sources(df):
    for col in df.columns:
        if df[col].astype(str).str.contains('SOURCES', case=False, na=False).any():
            return col
    return None

# Find the column with 'SOURCES' in df
col_name_df = find_column_with_sources(df)

if col_name_df:
    # Ensure the identified column values are strings
    df[col_name_df] = df[col_name_df].astype(str)
    
    # Filter rows where the identified column contains any of the keywords
    df_new = df_new[df_new[col_name_df].str.contains('|'.join(keywords), case=False, na=False)]
    df = df[df[col_name_df].str.contains('|'.join(keywords), case=False, na=False)]

def clean_power_supplier(name):
    cleaned_name = re.sub(r'^\d+\.\s*', '', name)  # Remove leading numbers and period
    cleaned_name = re.sub(r'\s*\(.*\)', '', cleaned_name)  # Remove text in parentheses
    return cleaned_name.strip()

def clean_avg_gen_cost(cost):
    return re.sub(r'^P\s*', '', cost).strip()  # Remove leading 'P' and any extra spaces

if len(df_new.columns) == 3:
    df_new.columns = ['Power Supplier','kWh','Average Generation Cost']
    df_new['Power Supplier'] = df_new['Power Supplier'].apply(clean_power_supplier)
    df_new['Average Generation Cost'] = df_new['Average Generation Cost'].apply(clean_avg_gen_cost)

elif len(df_new.columns) == 2:
    # Identify the column with header containing "E"
    col_name = next(col for col in df.columns if 'E' in col)

    # Remove leading "P", commas, and extra spaces, then convert to numeric
    df[col_name] = df[col_name].str.lstrip('P').str.replace(',', '').str.strip().astype(float)

    # Convert "(A)" column to numeric
    df_new['(A)'] = df_new['(A)'].str.replace(',', '').str.strip().astype(float)

    # Calculate "Average Generation Cost"
    df_new['Average Generation Cost'] = df[col_name] / df_new['(A)']
    df_new.columns = ['Power Supplier','kWh','Average Generation Cost']
    df_new['Power Supplier'] = df_new['Power Supplier'].apply(clean_power_supplier)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = df[col_name].str.lstrip('P').str.replace(',', '').str.strip().astype(float)


In [169]:
df_new

Unnamed: 0,Power Supplier,kWh,Average Generation Cost
1,GNPower Mariveles,17956233.0,4.62583
3,Angeles Power Inc.,21026.0,827.660209
5,Anda Power Corp.,6720000.0,5.655168
7,WESM,25092580.0,4.532759


**Notes**
- Computed column is inluded only from Oct 2022 onwards