In [1]:
import os
import re
import tabula
import pandas as pd
import numpy as np

In [2]:
# Set the JAVA_HOME environment variable to the Java installation directory
os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk/libexec/openjdk.jdk"

In [3]:
# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [6]:
%pip install -q tabula-py
%pip install requests beautifulsoup4

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### **Importing**

In [5]:
# Only Jan 2018 to Nov 2022 is machine readable
# Criteria for downloading: more than 2 nbsp and href link contains nebula
# tabula.read_pdf(... , pages=all)

In [70]:
import os
import requests
from bs4 import BeautifulSoup

# URL to be scraped
url = "https://www.akelco.com.ph/rates.html"

# Make a GET request to fetch the raw HTML content
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Find all strong tags
    strong_tags = soup.find_all('strong')
    
    base_url = "http://nebula.wsimg.com"
    
    def is_valid_pdf_link(tag):
        # Check for the 'a' tag directly under 'strong' or nested within it
        link_tag = tag.find_parent('a') or tag.find('a')
        if link_tag and '-' in tag.get_text() and 'href' in link_tag.attrs and 'nebula' in link_tag['href']:
            return True
        return False
    
    # List to store valid PDF links and their corresponding names
    pdf_links = []
    
    # Iterate over strong tags and check if they meet the criteria
    for strong in strong_tags:
        link_tag = strong.find_parent('a') or strong.find('a')
        if link_tag and is_valid_pdf_link(strong):
            text = ' '.join(strong.get_text().split())  # Remove unnecessary spaces
            pdf_links.append((link_tag['href'], text))
    
    # Download the PDFs
    download_directory = 'pdf downloads'
    os.makedirs(download_directory, exist_ok=True)
    
    for link, text in pdf_links:
        # Ensure the link has the correct scheme
        if not link.startswith('http'):
            full_url = 'https:' + link
        else:
            full_url = link
        response = requests.get(full_url)
        if response.status_code == 200:
            # Properly format file name
            pdf_name = text.replace(' ', '_') + '.pdf'
            with open(os.path.join(download_directory, pdf_name), 'wb') as pdf_file:
                pdf_file.write(response.content)
                print(f'Downloaded: {pdf_name}')
        else:
            print(f"Failed to download {text}: {response.status_code}")
    
    print('PDFs have been downloaded successfully.')
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


Downloaded: OCTOBER_2022_-_NOVEMBER_2022.pdf
Downloaded: ​APRIL_2022_-_SEPTEMBER_2022.pdf
Downloaded: NOVEMBER_2020_-_MARCH_2022.pdf
Downloaded: ​JUNE_2020_-_OCTOBER_2020.pdf
Downloaded: JANUARY_2018_-_MAY_2020.pdf
PDFs have been downloaded successfully.


### **Processing**

In [219]:
from dateutil import parser
from dateutil.relativedelta import relativedelta

# Define the folder containing the PDF files
pdf_folder = 'pdf downloads'

# Initialize an empty list to hold processed DataFrames
processed_dfs = []

# Helper function to parse dates from filename
def extract_date_range_from_filename(filename):
    filename = filename.replace(".pdf", "").replace("_", " ")
    date_parts = filename.split("-")
    start_date = parser.parse(date_parts[0].strip())
    end_date = parser.parse(date_parts[1].strip())
    return start_date, end_date

# Loop through each PDF file in the folder
for pdf_file in os.listdir(pdf_folder):
    if pdf_file.lower().endswith('.pdf'):
        pdf_path = os.path.join(pdf_folder, pdf_file)
        
        # Read the PDF into a list of DataFrames, one per page
        dataframes = tabula.read_pdf(pdf_path, lattice=True, pages='all')
        
        try:
            # Extract date range from filename
            start_date, end_date = extract_date_range_from_filename(pdf_file)
            
            # Generate dates for each page
            num_pages = len(dataframes)
            dates = [start_date + relativedelta(months=i) for i in range(num_pages)]
            
            for df, date in zip(dataframes, dates):
                try:
                    # Ensure the necessary column exists
                    if 'SOURCE' not in df.columns:
                        continue  # Skip this DataFrame if it doesn't have the 'SOURCE' column

                    # Find the index of the row containing "CONTRACT"
                    contract_index = df[df['SOURCE'].str.contains('CONTRACT', case=False, na=False)].index[0]

                    # Find the index of the row containing "IEMOP"
                    iemop_index = df[df['SOURCE'].str.contains('IEMOP', case=False, na=False)].index[0]

                    # Slice the DataFrame
                    df_new = df.loc[contract_index + 1:iemop_index]

                    # Identify the columns to keep
                    columns_to_keep = ['SOURCE']  # Start with the first column
                    columns_to_keep += [col for col in df_new.columns if '( A )' in col or '[ D / A ]' in col]

                    df_new = df_new[columns_to_keep]
                    df_new.columns = ['Power Supplier', 'kWh', 'Average Generation Cost']

                    # Get the 'Generation Charge' value
                    total_row = df[df.iloc[:, 0].eq('TOTAL') | df.iloc[:, 1].eq('TOTAL')]

                    if not total_row.empty:
                        # Try to get the value from the last column
                        if total_row.shape[1] > 1:
                            rate_value = (total_row.iloc[:, -1].values[0] if total_row.shape[1] > 1 else None)
                            if pd.isna(rate_value):  # Check if the last column value is NaN
                                rate_value = (total_row.iloc[:, -2].values[0] if total_row.shape[1] > 2 else None)
                                if pd.isna(rate_value):  # Check if the next-to-last column value is NaN
                                    rate_value = (total_row.iloc[:, -3].values[0] if total_row.shape[1] > 3 else None)
                        else:
                            rate_value = None
                    else:
                        rate_value = None

                    df_new["Generation Charge"] = rate_value

                    # Add the Date column
                    df_new["Date"] = date.strftime("%b-%Y")

                    # Append the processed DataFrame to the list
                    processed_dfs.append(df_new)

                except Exception as e:
                    print("PDF cannot be read properly.")
                    continue

        except Exception as e:
            print(f"Error processing file {pdf_file}: {e}")
            continue

# Concatenate all processed DataFrames into a single DataFrame
big_df = pd.concat(processed_dfs, ignore_index=True)


Error processing file ​APRIL_2022_-_SEPTEMBER_2022.pdf: Unknown string format: ​APRIL 2022
Error processing file ​JUNE_2020_-_OCTOBER_2020.pdf: Unknown string format: ​JUNE 2020
PDF cannot be read properly.
PDF cannot be read properly.
PDF cannot be read properly.
PDF cannot be read properly.
PDF cannot be read properly.
PDF cannot be read properly.
PDF cannot be read properly.
PDF cannot be read properly.
PDF cannot be read properly.


In [218]:
big_df

Unnamed: 0,Power Supplier,kWh,Average Generation Cost,Generation Charge,Date
0,GCGI,13192790,5.7857,9.5509,Nov-2022
1,PPC,4000,1448.0563,9.5509,Nov-2022
2,PEDC,"6 ,768,569",11.7418,9.5509,Nov-2022
3,PCPC,"3 ,287,000",18.0578,9.5509,Nov-2022
4,IEMOP,"8 ,798,500",9.6972,9.5509,Nov-2022
5,GCGI,12716000,5.7857,9.7002,Jan-2023
6,PPC,18300,326.7243,9.7002,Jan-2023
7,PEDC,"6 ,116,000",11.0525,9.7002,Jan-2023
8,PCPC,"3 ,667,000",18.1512,9.7002,Jan-2023
9,IEMOP,"8 ,357,030",10.2797,9.7002,Jan-2023
