In [19]:
import os
import re
import tabula
import pandas as pd
import numpy as np

In [20]:
# Set the JAVA_HOME environment variable to the Java installation directory
os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk/libexec/openjdk.jdk"

In [21]:
# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [22]:
%pip install -q tabula-py
%pip install requests beautifulsoup4
%pip install openpyxl

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### **Importing**

The following lines of code will download the relevant files from AKELCO's webpage. Note that only Jan 2018 to Nov 2022 is machine readable.

In [23]:
import os
import requests
from bs4 import BeautifulSoup

# URL to be scraped
url = "https://www.akelco.com.ph/rates.html"

# Make a GET request to fetch the raw HTML content
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Find all strong tags
    strong_tags = soup.find_all('strong')
    
    base_url = "http://nebula.wsimg.com"
    
    def is_valid_pdf_link(tag):
        # Check for the 'a' tag directly under 'strong' or nested within it
        link_tag = tag.find_parent('a') or tag.find('a')
        if link_tag and '-' in tag.get_text() and 'href' in link_tag.attrs and 'nebula' in link_tag['href']:
            return True
        return False
    
    # List to store valid PDF links and their corresponding names
    pdf_links = []
    
    # Iterate over strong tags and check if they meet the criteria
    for strong in strong_tags:
        link_tag = strong.find_parent('a') or strong.find('a')
        if link_tag and is_valid_pdf_link(strong):
            text = ' '.join(strong.get_text().split())  # Remove unnecessary spaces
            pdf_links.append((link_tag['href'], text))
    
    # Download the PDFs
    download_directory = 'pdf downloads'
    os.makedirs(download_directory, exist_ok=True)
    
    for link, text in pdf_links:
        # Ensure the link has the correct scheme
        if not link.startswith('http'):
            full_url = 'https:' + link
        else:
            full_url = link
        response = requests.get(full_url)
        if response.status_code == 200:
            # Properly format file name
            pdf_name = text.replace(' ', '_') + '.pdf'
            with open(os.path.join(download_directory, pdf_name), 'wb') as pdf_file:
                pdf_file.write(response.content)
                print(f'Downloaded: {pdf_name}')
        else:
            print(f"Failed to download {text}: {response.status_code}")
    
    print('PDFs have been downloaded successfully.')
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

Downloaded: OCTOBER_2022_-_NOVEMBER_2022.pdf
Downloaded: ​APRIL_2022_-_SEPTEMBER_2022.pdf
Downloaded: NOVEMBER_2020_-_MARCH_2022.pdf
Downloaded: ​JUNE_2020_-_OCTOBER_2020.pdf
Downloaded: JANUARY_2018_-_MAY_2020.pdf
PDFs have been downloaded successfully.


### **Processing**

In [31]:
from dateutil import parser
from dateutil.relativedelta import relativedelta

# Define the folder containing the PDF files
pdf_folder = 'pdf downloads'

# Initialize an empty list to hold processed DataFrames
processed_dfs = []

# Helper function to parse dates from filename
def extract_date_range_from_filename(filename):
    # Remove invisible characters
    filename = ''.join(c for c in filename if c.isprintable())
    filename = filename.replace(".pdf", "").replace("_", " ")
    date_parts = filename.split("-")
    start_date = parser.parse(date_parts[0].strip())
    end_date = parser.parse(date_parts[1].strip())
    return start_date, end_date

# Loop through each PDF file in the folder
for pdf_file in os.listdir(pdf_folder):
    if pdf_file.lower().endswith('.pdf'):
        pdf_path = os.path.join(pdf_folder, pdf_file)
        
        # Read the PDF into a list of DataFrames, one per page
        dataframes = tabula.read_pdf(pdf_path, lattice=True, pages='all')
        
        try:
            # Extract date range from filename
            start_date, end_date = extract_date_range_from_filename(pdf_file)
            
            # Generate dates for each page
            num_pages = len(dataframes)
            dates = [start_date + relativedelta(months=i) for i in range(num_pages)]
            
            for df, date in zip(dataframes, dates):
                try:
                    # Ensure the necessary column exists
                    if 'SOURCE' not in df.columns:
                        continue

                    # Find the index of the row containing "CONTRACT"
                    contract_index = df[df['SOURCE'].str.contains('CONTRACT', case=False, na=False)].index[0]

                    # Find the index of the row containing "IEMOP"
                    iemop_index = df[df['SOURCE'].str.contains('IEMOP', case=False, na=False)].index[0]

                    # Slice the DataFrame
                    df_new = df.loc[contract_index + 1:iemop_index]

                    # Identify the columns to keep
                    columns_to_keep = ['SOURCE']  # Start with the first column
                    columns_to_keep += [col for col in df_new.columns if '( A )' in col or '[ D / A ]' in col or '%' in col]

                    df_new = df_new[columns_to_keep]
                    df_new.columns = ['Power Supplier','%', 'kWh', 'Average Generation Cost']

                    # Get the 'Generation Charge' value
                    total_row = df[df.iloc[:, 0].eq('TOTAL') | df.iloc[:, 1].eq('TOTAL')]

                    if not total_row.empty:
                        # Try to get the value from the last column
                        if total_row.shape[1] > 1:
                            rate_value = (total_row.iloc[:, -1].values[0] if total_row.shape[1] > 1 else None)
                            if pd.isna(rate_value):  # Check if the last column value is NaN
                                rate_value = (total_row.iloc[:, -2].values[0] if total_row.shape[1] > 2 else None)
                                if pd.isna(rate_value):  # Check if the next-to-last column value is NaN
                                    rate_value = (total_row.iloc[:, -3].values[0] if total_row.shape[1] > 3 else None)
                        else:
                            rate_value = None
                    else:
                        rate_value = None

                    df_new["Generation Charge"] = rate_value

                    # Add the Date column
                    df_new["Date"] = date.strftime("%b-%Y")
                    print(df_new)

                    # Append the processed DataFrame to the list
                    processed_dfs.append(df_new)

                except Exception as e:
                    print("PDF cannot be read properly.")
                    continue

        except Exception as e:
            print(f"Error processing file {pdf_file}: {e}")
            continue

# Concatenate all processed DataFrames into a single DataFrame
big_df = pd.concat(processed_dfs, ignore_index=True)

  Power Supplier       %         kWh Average Generation Cost  \
2           GCGI  38.94%  10,230,422                  5.7785   
3            PPC   0.00%           -                 #DIV/0!   
4           PEDC  22.04%  5 ,789,848                  9.8442   
5           PCPC  14.86%  3 ,905,266                 11.3017   
6          IEMOP  11.30%  2 ,967,540                  5.8793   

  Generation Charge      Date  
2            7.3671  May-2022  
3            7.3671  May-2022  
4            7.3671  May-2022  
5            7.3671  May-2022  
6            7.3671  May-2022  
  Power Supplier       %         kWh Average Generation Cost  \
2           GCGI  39.97%  12,387,000                  5.7857   
3            PPC   0.38%     118,200                 56.0278   
4           PEDC  18.45%  5 ,719,200                 10.5819   
5           PCPC   0.46%     142,000                 11.2790   
6          IEMOP  20.71%  6 ,417,380                  7.8556   

  Generation Charge      Date  
2     

In [32]:
big_df.head()

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost,Generation Charge,Date
0,GCGI,38.94%,10230422,5.7785,7.3671,May-2022
1,PPC,0.00%,-,#DIV/0!,7.3671,May-2022
2,PEDC,22.04%,"5 ,789,848",9.8442,7.3671,May-2022
3,PCPC,14.86%,"3 ,905,266",11.3017,7.3671,May-2022
4,IEMOP,11.30%,"2 ,967,540",5.8793,7.3671,May-2022


#### Creating Supplier Dataframe

In [33]:
unique_suppliers = big_df['Power Supplier'].unique()

# Create a mapping of power suppliers to unique IDs
supplier_id_map = {supplier: id+1 for id, supplier in enumerate(unique_suppliers)}

# Create a new DataFrame from the mapping
supplier_df = pd.DataFrame(list(supplier_id_map.items()), columns=['Power Supplier', 'Power Supplier ID'])

supplier_df.head()

Unnamed: 0,Power Supplier,Power Supplier ID
0,GCGI,1
1,PPC,2
2,PEDC,3
3,PCPC,4
4,IEMOP,5


In [34]:
# Create a mapping from Power Suppliers to Supplier IDs
supplier_mapping = dict(zip(supplier_df['Power Supplier'], supplier_df['Power Supplier ID']))

# Replace names with IDs in big_df
big_df['Power Supplier ID'] = big_df['Power Supplier'].map(supplier_mapping)

# Ensure IDs are not converted to floats
#big_df['Power Supplier ID'] = big_df['Power Supplier ID'].astype(int)

# Drop the old Power Suppliers column
big_df = big_df.drop(columns=['Power Supplier'])

big_df.head()

Unnamed: 0,%,kWh,Average Generation Cost,Generation Charge,Date,Power Supplier ID
0,38.94%,10230422,5.7785,7.3671,May-2022,1
1,0.00%,-,#DIV/0!,7.3671,May-2022,2
2,22.04%,"5 ,789,848",9.8442,7.3671,May-2022,3
3,14.86%,"3 ,905,266",11.3017,7.3671,May-2022,4
4,11.30%,"2 ,967,540",5.8793,7.3671,May-2022,5


In [35]:
with pd.ExcelWriter("Historical_AKELCO_GC_Breakdown.xlsx", engine='openpyxl') as writer:
    big_df.to_excel(writer, sheet_name='Historical GC', index=False)
    supplier_df.to_excel(writer, sheet_name='Supplier IDs', index=False)

#### for troubleshooting

In [None]:
df = tabula.read_pdf("pdf downloads/​APRIL_2022_-_SEPTEMBER_2022.pdf", lattice = True, pages = 'all')[1]

In [None]:
df

Unnamed: 0.1,SOURCE,% to Total kWh\rPurchased,( A )\rkWh Purchased,( B )\rBasic Generation\rCost\r(PhP),"( C )\rOther Cost\rAdjustments\r(DAA, NSS, and\r\rOther Billing\rAdjustments)\r(PhP)",( D = B + C - PCR )\rTotal Generation\rCost for the Month\r(PhP),[ D / A ]\rAverage\rG e n e r a tion Cost\r(PhP/kWh),Unnamed: 0,Unnamed: 1
0,NPC - PSALM,12.83%,3370000.00,12517864.20,"(103,592.11)",12414272.09,3.6838,,
1,BILATERAL\rCONTRACT W/ IPPs,,,,,,,,
2,GCGI,38.94%,10230422,60077386.15,"(961,238.18)",59116147.97,5.7785,,
3,PPC,0.00%,-,5666800.00,,5666800.00,#DIV/0!,,
4,PEDC,22.04%,"5 ,789,848",57221078.08,"(224,900.93)",56996177.15,9.8442,,
5,PCPC,14.86%,"3 ,905,266",44195640.77,"(59,659.22)",44135981.56,11.3017,,
6,IEMOP,11.30%,"2 ,967,540",17447171.17,,17447171.17,5.8793,,
7,Net Metering,0.03%,9019,"6 5,386.85",,65386.85,7.2499,,
8,Less: Pilferage Cost\rRecoveries (PCR),,,,,"(32,334.54)",,,
9,SELF-GENERATION,,,,,,,,
