In [3]:
import os
import re
import tabula
import pandas as pd
import numpy as np

In [4]:
# Set the JAVA_HOME environment variable to the Java installation directory
os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk/libexec/openjdk.jdk"

In [5]:
# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [6]:
%pip install -q tabula-py
%pip install requests beautifulsoup4
%pip install openpyxl

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### **Importing**

Download the source code for AEC's webpage.

In [1]:
import requests
from bs4 import BeautifulSoup

# URL to be scraped
url = "https://angeleselectric.com.ph/generation-charge/"

# Make a GET request to fetch the raw HTML content
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Save the raw HTML to a file
    with open('generation_charge.html', 'w', encoding='utf-8') as file:
        file.write(soup.prettify())
    
    print("HTML content has been saved successfully.")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

HTML content has been saved successfully.


In [16]:
# Path to the HTML file
html_file_path = 'generation_charge.html'

# Directory to save downloaded PDFs
save_dir = 'pdf downloads'
os.makedirs(save_dir, exist_ok=True)

# Load the HTML content
with open(html_file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

# Parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# Find all <a> tags with text containing "PDF"
links = soup.find_all('a', string=re.compile(r'PDF', re.IGNORECASE))

# Check if any links are found
if not links:
    print("No links found with the text containing 'PDF'.")

# Function to sanitize file names
def sanitize_filename(filename):
    return "".join(c for c in filename if c.isalnum() or c in (' ', '_', '-')).rstrip()

# Download each PDF
for link in links:
    pdf_url = link['href']
    
    # Find the closest preceding <strong> tag
    strong_tag = link.find_previous('strong')
    if strong_tag:
        file_name = strong_tag.get_text(strip=True)
    else:
        file_name = link.text.strip().replace(' ', '_')
    
    sanitized_file_name = sanitize_filename(file_name) + '.pdf'
    file_path = os.path.join(save_dir, sanitized_file_name)
    
    # Download the PDF file
    response = requests.get(pdf_url)
    if response.status_code == 200:
        with open(file_path, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f'Downloaded: {file_path}')
    else:
        print(f'Failed to download: {pdf_url}')

print('Download completed.')


Downloaded: pdf downloads/July 2024 Generation Charge.pdf
Downloaded: pdf downloads/June 2024 Generation Charge.pdf
Downloaded: pdf downloads/May 2024 Generation Charge.pdf
Downloaded: pdf downloads/April 2024 Generation Charge.pdf
Downloaded: pdf downloads/March 2024 Generation Charge.pdf
Downloaded: pdf downloads/February 2024 Generation Charge.pdf
Downloaded: pdf downloads/January 2024 Generation Charge.pdf
Downloaded: pdf downloads/December 2023 Generation Charge.pdf
Downloaded: pdf downloads/November 2023 Generation Charge.pdf
Downloaded: pdf downloads/October 2023 Generation Charge.pdf
Downloaded: pdf downloads/September 2023 Generation Charge.pdf
Downloaded: pdf downloads/August 2023 Generation Charge.pdf
Downloaded: pdf downloads/July 2023 Generation Charge.pdf
Downloaded: pdf downloads/June 2023 Generation Charge.pdf
Downloaded: pdf downloads/May 2023 Generation Charge.pdf
Downloaded: pdf downloads/April 2023 Generation Charge.pdf
Downloaded: pdf downloads/March 2023 Generatio

### **Processing**

In [318]:
from datetime import datetime

# Directory containing the PDF files
pdf_folder = "pdf downloads"

# List all PDF files in the folder
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]

# Initialize an empty DataFrame to hold all results
big_df = pd.DataFrame()

# Define the list of keywords to filter by
keywords = ['Contract', 'WESM']

# Function to find the column containing 'SOURCES'
def find_column_with_sources(df):
    for col in df.columns:
        if df[col].astype(str).str.contains('SOURCES', case=False, na=False).any():
            return col
    return None

# Function to clean power supplier names
def clean_power_supplier(name):
    cleaned_name = re.sub(r'^\d+\.\s*', '', name)  # Remove leading numbers and period
    cleaned_name = re.sub(r'\s*\(.*\)', '', cleaned_name)  # Remove text in parentheses
    return cleaned_name.strip()

# Function to clean average generation cost
def clean_avg_gen_cost(cost):
    return re.sub(r'^P\s*', '', cost).strip()  # Remove leading 'P' and any extra spaces

# Function to extract and format date from filename
def extract_date_from_filename(filename):
    # Remove the leading directory and file extension
    filename = os.path.basename(filename).replace('.pdf', '')
    
    # Extract month and year
    match = re.search(r'(\w+)\s(\d{4})', filename)
    if match:
        month_str, year = match.groups()
        # Convert month name to abbreviated form
        try:
            month = datetime.strptime(month_str, '%B').strftime('%b')
        except ValueError:
            return None
        return f"{month}-{year}"
    return None

# Iterate through all PDF files
for pdf_file in pdf_files:
    try:
        # Read the PDF file
        file_path = os.path.join(pdf_folder, pdf_file)
        df = tabula.read_pdf(file_path, lattice=True, pages=1)[0]

        # Filter columns
        columns_to_keep = [col for col in df.columns if df[col].astype(str).str.contains('SOURCES|Kwh|Input|Purchased').any()]
        df_new = df[columns_to_keep]

        # Find the column with 'SOURCES'
        col_name_df = find_column_with_sources(df)

        if col_name_df:
            # Ensure the identified column values are strings
            df[col_name_df] = df[col_name_df].astype(str)
            
            # Filter rows where the identified column contains any of the keywords
            df_new = df_new[df_new[col_name_df].str.contains('|'.join(keywords), case=False, na=False)]
            
            # Slice df so that indices in df and df_new match
            df_sliced = df.loc[df_new.index]

            if len(df_new.columns) == 3:
                df_new.columns = ['Power Supplier', 'kWh', 'Average Generation Cost']
                df_new['Power Supplier'] = df_new['Power Supplier'].apply(clean_power_supplier)
                df_new['Average Generation Cost'] = df_new['Average Generation Cost'].apply(clean_avg_gen_cost)

            elif len(df_new.columns) == 2:
                # Identify the column with header containing "E"
                col_name = next(col for col in df.columns if 'E' in col)

                # Remove leading "P", commas, and extra spaces
                df_sliced[col_name] = df_sliced[col_name].str.lstrip('P').str.replace(',', '').str.strip()

                # Replace '-' with NaN, then convert to numeric, coercing errors to NaN
                df_sliced[col_name] = df_sliced[col_name].replace('-', np.nan)
                df_sliced[col_name] = pd.to_numeric(df_sliced[col_name], errors='coerce')

                # Convert "(A)" column to numeric
                df_new['(A)'] = df_new['(A)'].str.replace(r'[,\s]', '', regex=True)
                df_new['(A)'] = df_new['(A)'].replace('-', np.nan)
                df_new['(A)'] = pd.to_numeric(df_new['(A)'], errors='coerce')
                
                # Calculate "Average Generation Cost"
                df_new['Average Generation Cost'] = df_sliced[col_name] / df_new['(A)']
                df_new.columns = ['Power Supplier', 'kWh', 'Average Generation Cost']
                df_new['Power Supplier'] = df_new['Power Supplier'].apply(clean_power_supplier)

            # Get the rows containing 'CHARGE'
            charge_rows = df[df[col_name_df].str.contains('for', case=False, na=False)]

            if not charge_rows.empty:
                # Select the last row from charge_rows
                charge_row = charge_rows.iloc[-1]
                
                # Find the last non-NaN value in the selected row
                for value in reversed(charge_row.iloc[2:]):
                    if not pd.isna(value):
                        # Clean the value: remove leading 'P', commas, and extra spaces
                        rate_value = re.sub(r'^P\s*|[,]', '', value).strip()
                        break
                else:
                    rate_value = None
            else:
                rate_value = None

            # Assign the cleaned value to 'Generation Charge' in df_new
            df_new['Generation Charge'] = rate_value

            # Extract date from filename and add to df_new
            date_str = extract_date_from_filename(pdf_file)
            df_new['Date'] = date_str

            # Append the processed DataFrame to big_df
            big_df = pd.concat([big_df, df_new], ignore_index=True)

    except Exception as e:
        print(f"Error processing file {pdf_file}: {e}")


  for value in reversed(charge_row.iloc[2:]):
  for value in reversed(charge_row.iloc[2:]):
  for value in reversed(charge_row.iloc[2:]):
  for value in reversed(charge_row.iloc[2:]):
  for value in reversed(charge_row.iloc[2:]):
  for value in reversed(charge_row.iloc[2:]):
  for value in reversed(charge_row.iloc[2:]):
  for value in reversed(charge_row.iloc[2:]):
  for value in reversed(charge_row.iloc[2:]):
  for value in reversed(charge_row.iloc[2:]):
  for value in reversed(charge_row.iloc[2:]):
  for value in reversed(charge_row.iloc[2:]):
  for value in reversed(charge_row.iloc[2:]):
  for value in reversed(charge_row.iloc[2:]):
  for value in reversed(charge_row.iloc[2:]):
  for value in reversed(charge_row.iloc[2:]):
  for value in reversed(charge_row.iloc[2:]):
  for value in reversed(charge_row.iloc[2:]):
  for value in reversed(charge_row.iloc[2:]):
  for value in reversed(charge_row.iloc[2:]):
  for value in reversed(charge_row.iloc[2:]):
  for value in reversed(charge_row

In [319]:
big_df.head()

Unnamed: 0,Power Supplier,kWh,Average Generation Cost,Generation Charge,Date
0,GNPower Mariveles,37692892.0,6.139583,5.9543,Oct-2021
1,Anda Power Corp..,7440000.0,6.583905,5.9543,Oct-2021
2,Angeles Power Inc.,22060.0,793.415525,5.9543,Oct-2021
3,WESM,12200760.0,4.826696,5.9543,Oct-2021
4,GNPower Mariveles,33473073.0,10.886309,10.8367,Nov-2022


#### for troubleshooting of loop

In [308]:
df = tabula.read_pdf("pdf downloads/November 2022 Generation Charge.pdf", lattice = True, pages=1)[0]

In [309]:
df

Unnamed: 0.1,Unnamed: 0,(A),(B),Unnamed: 1,(C),(D),(E = C + D),Unnamed: 2
0,SOURCES,kWh Purchased,% Share,,Basic Generation Cost\r(PhP),O t h e r Cost\r1Adjustments\r(PhP),Total Generation Cost\r(PhP),Average Gen.\r\rCost\r(PhP/kWh)
1,1. GNPower Mariveles (Bilateral Contract),33473073,56.6%,,"P 376,348,392.13","( 11,950,165.83)","P 364,398,226.30",10.8863
2,,,,,,,,
3,2. Anda Power Corp. (Bilateral Contract),10743750,18.2%,,105427430.25,"(399,443.25)",105027987.00,9.7757
4,,,,,,,,
5,3. WESM (Spot Market),14460080,24.4%,,148312847.06,1454691.92,149767538.98,10.3573
6,,,,,,,,
7,4. Angeles Power Inc. (Bilateral Contract),292224,0.5%,,19941233.75,-,19941233.75,68.2395
8,,,,,,,,
9,5. Net Metering Export Energy,210624,0.356%,,2281310.54,-,2281310.54,10.8312


In [310]:
#Filtering columns
columns_to_keep = [col for col in df.columns if df[col].astype(str).str.contains('SOURCES|Kwh|Input|Purchased').any()]
df_new = df[columns_to_keep]

# Define the list of keywords to filter by 
# Add 'TOTAL' if needed
keywords = ['Contract', 'WESM']

# Function to find the column containing 'SOURCES'
def find_column_with_sources(df):
    for col in df.columns:
        if df[col].astype(str).str.contains('SOURCES', case=False, na=False).any():
            return col
    return None

# Find the column with 'SOURCES' in df, returns the column name where SOURCES is
col_name_df = find_column_with_sources(df)

if col_name_df:
    # Ensure the identified column values are strings
    df[col_name_df] = df[col_name_df].astype(str)
    
    # Filter rows where the identified column contains any of the keywords
    df_new = df_new[df_new[col_name_df].str.contains('|'.join(keywords), case=False, na=False)]
    
    # Slice df so that indices in df and df_new match
    df_sliced = df.loc[df_new.index]

def clean_power_supplier(name):
    cleaned_name = re.sub(r'^\d+\.\s*', '', name)  # Remove leading numbers and period
    cleaned_name = re.sub(r'\s*\(.*\)', '', cleaned_name)  # Remove text in parentheses
    return cleaned_name.strip()

def clean_avg_gen_cost(cost):
    return re.sub(r'^P\s*', '', cost).strip()  # Remove leading 'P' and any extra spaces

if len(df_new.columns) == 3:
    df_new.columns = ['Power Supplier','kWh','Average Generation Cost']
    df_new['Power Supplier'] = df_new['Power Supplier'].apply(clean_power_supplier)
    df_new['Average Generation Cost'] = df_new['Average Generation Cost'].apply(clean_avg_gen_cost)

elif len(df_new.columns) == 2:
    # Identify the column with header containing "E"
    col_name = next(col for col in df.columns if 'E' in col)

    # Remove leading "P", commas, and extra spaces
    df_sliced[col_name] = df_sliced[col_name].str.lstrip('P').str.replace(',', '').str.strip()

    # Replace '-' with NaN
    df_sliced[col_name] = df_sliced[col_name].replace('-', np.nan)

    # Convert to numeric, ignoring NaNs
    df_sliced[col_name] = pd.to_numeric(df_sliced[col_name], errors='coerce')

    # Remove commas and extra spaces
    df_new['(A)'] = df_new['(A)'].str.replace(r'[,\s]', '', regex=True)

    # Replace '-' with NaN
    df_new['(A)'] = df_new['(A)'].replace('-', np.nan)

    # Convert to numeric, ignoring NaNs
    df_new['(A)'] = pd.to_numeric(df_new['(A)'], errors='coerce')
    
    # Calculate "Average Generation Cost"
    df_new['Average Generation Cost'] = df_sliced[col_name] / df_new['(A)']
    df_new.columns = ['Power Supplier','kWh','Average Generation Cost']
    df_new['Power Supplier'] = df_new['Power Supplier'].apply(clean_power_supplier)

In [311]:
# Get the rows containing 'CHARGE'
charge_rows = df[df[col_name_df].str.contains('for', case=False, na=False)]

if not charge_rows.empty:
    # Select the last row from charge_rows
    charge_row = charge_rows.iloc[-1]
    
    # Find the last non-NaN value in the selected row
    for value in reversed(charge_row[2:]):
        if not pd.isna(value):
            # Clean the value: remove leading 'P', commas, and extra spaces
            rate_value = re.sub(r'^P\s*|[,]', '', value).strip()
            break
    else:
        rate_value = None
else:
    rate_value = None

# Assign the cleaned value to 'Generation Charge' in df_new
df_new['Generation Charge'] = rate_value
df_new

  for value in reversed(charge_row[2:]):


Unnamed: 0,Power Supplier,kWh,Average Generation Cost,Generation Charge
1,GNPower Mariveles,33473073,10.886309,10.8367
3,Anda Power Corp.,10743750,9.775729,10.8367
5,WESM,14460080,10.357311,10.8367
7,Angeles Power Inc.,292224,68.239548,10.8367
