In [11]:
import os
import pandas as pd
import tabula

In [3]:
# Make sure JAVA_HOME is defined before tabula is used. A value already present
# in the environment is kept; the Homebrew OpenJDK location below is only a
# fallback for when the variable is unset.
os.environ.setdefault("JAVA_HOME", "/opt/homebrew/opt/openjdk/libexec/openjdk.jdk")

In [6]:
# Lift pandas' display limits so frames are rendered with every row and column
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [7]:
%pip install -q tabula-py
%pip install requests beautifulsoup4
%pip install openpyxl
%pip install selenium
%pip install webdriver-manager

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### **Importing**

Download the HTML source code of a single page (the January 2021 generation-rate page).

In [8]:
import requests

# URL of the webpage
url = "https://www.ceneco.ph/generation-rate/january-2021"

# Define headers to mimic a web browser request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

# Send a GET request to the URL with headers. The timeout makes the cell fail
# with an exception instead of hanging forever if the server never answers.
response = requests.get(url, headers=headers, timeout=30)

# Only a 200 response is treated as success; any other status is reported
if response.status_code == 200:
    # Get the HTML content of the page
    html_content = response.text

    # Save the HTML content to a file in the current working directory
    with open("january_2021_source.html", "w", encoding='utf-8') as file:
        file.write(html_content)

    print("Source code downloaded successfully!")
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

Source code downloaded successfully!


In [9]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

# Create the folder for downloads. exist_ok=True makes this safe to re-run and
# avoids the gap between checking for the folder and creating it.
download_folder = "pdf downloads"
os.makedirs(download_folder, exist_ok=True)

# Base URL format; the placeholders are filled with (month name, year)
base_url = "https://www.ceneco.ph/generation-rate/{}-{}"

# Month and year to start from; the crawl runs up to the current date
start_date = datetime(2020, 1, 1)
end_date = datetime.today()

# Headers to mimic a real browser.
# "br" is intentionally not advertised in Accept-Encoding: requests only decodes
# Brotli responses when a Brotli library is installed, and this notebook does
# not install one, so a Brotli-compressed page would reach the parser undecoded.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Connection": "keep-alive",
}

# Function to download PDF from a URL
def download_pdf(pdf_url, timeout=30):
    """Download ``pdf_url`` into ``download_folder`` and print the saved path.

    The local file name is the last segment of the URL's path, so a query
    string or fragment on the URL does not end up in the file name.

    Args:
        pdf_url: Address of the PDF to fetch.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        ValueError: If the URL path has no final segment to use as a file name
            (for example when it ends in ``/``).
        requests.exceptions.HTTPError: If the server answers with an error status.
    """
    from urllib.parse import urlsplit

    # Validate the target name first so no request is made for an unusable URL
    base_name = urlsplit(pdf_url).path.rsplit('/', 1)[-1]
    if not base_name:
        raise ValueError(f"Cannot derive a file name from URL: {pdf_url!r}")

    pdf_response = requests.get(pdf_url, headers=headers, timeout=timeout)
    pdf_response.raise_for_status()

    # Make sure the destination exists even if the folder was removed meanwhile
    os.makedirs(download_folder, exist_ok=True)
    file_name = os.path.join(download_folder, base_name)
    with open(file_name, 'wb') as file:
        file.write(pdf_response.content)
    print(f"Downloaded: {file_name}")

# Function to check for PDFs on a given URL
def check_for_pdfs(url, timeout=30):
    """Return the PDF addresses linked from the page at ``url``.

    Every ``<a href=...>`` whose href contains ``.pdf`` is considered. When the
    href carries the PDF address in a ``file=`` query parameter, that value is
    extracted (up to the next ``&``). Otherwise the href itself is used,
    resolved against ``url``. Each address is returned once, in page order.

    Args:
        url: Page to scan.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of PDF URLs. The list is empty when the page has no PDF links
        or when the server answers with an HTTP error status.

    Raises:
        requests.exceptions.RequestException: For failures other than an HTTP
            error status (connection problems, timeouts); these propagate.
    """
    from urllib.parse import urljoin

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        print(f"HTTPError for URL {url}: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    pdf_urls = []

    for link_tag in soup.find_all('a', href=True):
        href = link_tag['href']
        if ".pdf" not in href:
            continue

        if 'file=' in href:
            pdf_url = href.split('file=')[1].split('&')[0]
        else:
            # Plain link to the PDF: there is no file= parameter to unwrap
            pdf_url = urljoin(url, href)

        # Skip repeated links so the same file is not downloaded twice
        if pdf_url not in pdf_urls:
            pdf_urls.append(pdf_url)

    return pdf_urls

# Loop through the months and years
#
# Every month from start_date to end_date is checked exactly once, including
# the misspelled January URL. The crawl stops early after a run of consecutive
# months without any PDF: the first empty month plus the 5 months after it.
max_missing_streak = 6

current_date = start_date
missing_streak = 0
first_missing = None  # (month, year) that opened the current run of empty months

while current_date <= end_date:
    # Format the month and year for the URL
    month = current_date.strftime("%B").lower()  # e.g., 'january'
    year = current_date.strftime("%Y")  # e.g., '2020'

    # Generate the URL
    url = base_url.format(month, year)

    # Check for PDFs on the correct URL
    pdf_urls = check_for_pdfs(url)

    if not pdf_urls and month == 'january':
        # Handle the specific misspelling for January
        typo_url = base_url.format('janauary', year)
        pdf_urls = check_for_pdfs(typo_url)
        if pdf_urls:
            print(f"Found PDF with misspelling for January {year}.")

    if pdf_urls:
        for pdf_url in pdf_urls:
            download_pdf(pdf_url)
        # A hit ends any run of empty months
        missing_streak = 0
        first_missing = None
    else:
        if missing_streak == 0:
            first_missing = (month, year)
        missing_streak += 1
        if missing_streak >= max_missing_streak:
            missing_month, missing_year = first_missing
            print(f"No valid PDF found for {missing_month.capitalize()}-{missing_year} or the next 5 months. Ending the loop.")
            break

    # Move to the first day of the next month: day 28 plus 4 days always lands
    # in the following month, whatever the length of the current one.
    current_date = (current_date.replace(day=28) + timedelta(days=4)).replace(day=1)

Downloaded: pdf downloads/January-2020.pdf
Downloaded: pdf downloads/February-2020.pdf
Downloaded: pdf downloads/March-2020.pdf
Downloaded: pdf downloads/April-2020.pdf
Downloaded: pdf downloads/May-2020.pdf
Downloaded: pdf downloads/June-2020.pdf
Downloaded: pdf downloads/July-2020.pdf
Downloaded: pdf downloads/August-2020.pdf
Downloaded: pdf downloads/September-2020.pdf
Downloaded: pdf downloads/October-2020.pdf
Downloaded: pdf downloads/November-2020.pdf
Downloaded: pdf downloads/December-2020.pdf
Downloaded: pdf downloads/January-2021-rotated-1.pdf
Downloaded: pdf downloads/February-2021-rotated-1.pdf
Downloaded: pdf downloads/Generation-Charge-for-March-2021-rotated-1.pdf
Downloaded: pdf downloads/GENERATION-CHARGE-JUNE-2021-1.pdf
Downloaded: pdf downloads/Comp.-of-Gen.-Charge-Nov.-2021.pdf
Downloaded: pdf downloads/Generation-Charge-December-2021.pdf
Downloaded: pdf downloads/Generation-Charge-January-2022.pdf
Downloaded: pdf downloads/GR-JAN.pdf
Downloaded: pdf downloads/GR-FEB.

Note: The reports for April 2021 and May 2021 are both images, so they are not machine-readable.

In [26]:
# Check which files are actually machine readable by trying to pass them through tabula
pdf_path = "pdf downloads/July-2024-Generation-Rate.pdf"
tables = tabula.read_pdf(pdf_path, stream=True, pages=1)

# Fail with a clear message instead of a bare IndexError when no table comes back
# (presumably the case for image-only PDFs; confirm against tabula's behavior).
if not tables:
    raise ValueError(f"No machine-readable table found on page 1 of {pdf_path!r}")

df = tables[0]

In [27]:
# Display the first table tabula extracted from page 1, exactly as parsed
df

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,(A),(B),(C),(E= B + C),(F = E/A )
0,,Source,,% to Total,,,Other cost adjustment,,Average Gen
1,,,,,,Basic Generation,"(DAA, NSS, and other",Total Gen Cost for the month,Cost for June
2,,,,kWh,kWh Purchased,,,,
3,,,,,,Cost (PhP),billing Adjustment,(PhP),2024
4,,,,Purchased,,,,,
5,,,,,,,(PhP),,(PhP/kWh)
6,A. Power Supply Agreement (PSAs),,Dispatch,,,,,,
7,1. Green Core Geothermal Inc. (GCGI),,,15.67%,14880000,95849520.00,-,"9 5,849,520.00",6.4415
8,2. Energy Development Corporation (EDC),,,6.07%,5760000,36288000.00,-,"3 6,288,000.00",6.3000
9,3. Palm Concepcion Power Corp. (PCPC),,,27.43%,26040000,205710307.77,-,"2 05,710,307.77",7.8998


In [28]:
# Function to find the column name where a keyword is present in any of its rows
def find_column_by_keyword(df, keyword, regex=True):
    """Return the labels of the columns in which any cell contains ``keyword``.

    Matching is case-insensitive and only looks at cells that hold strings.
    Missing values and non-string cells never match, so numeric columns are
    skipped instead of raising.

    Args:
        df: DataFrame to search.
        keyword: Text to look for. Interpreted as a regular expression unless
            ``regex`` is False.
        regex: Pass False to match ``keyword`` literally (useful when it
            contains characters such as ``(`` or ``+``).

    Returns:
        A pandas Index with the matching column labels, in column order.
        The Index is empty when nothing matches.
    """
    def _column_matches(col):
        # Keep string cells only: the .str accessor rejects non-string data
        is_text = col.map(lambda value: isinstance(value, str)).astype(bool).to_numpy()
        text_cells = col[is_text].astype(object)
        hits = text_cells.str.contains(keyword, case=False, na=False, regex=regex)
        return bool(hits.any())

    mask = pd.Series([_column_matches(col) for _, col in df.items()], dtype=bool)
    return df.columns[mask.to_numpy()]

# One keyword per column of interest; for each keyword the first column that
# contains it in any row is kept, in the order the keywords are listed.
keywords = ('Supply', '%', 'kWh Purchased', 'Average')
columns_to_include = [find_column_by_keyword(df, keyword)[0] for keyword in keywords]

# Build a new DataFrame restricted to the matched columns
sliced_cols_df = df[columns_to_include]

sliced_cols_df

Unnamed: 0.1,Unnamed: 0,Unnamed: 3,(A),(F = E/A )
0,,% to Total,,Average Gen
1,,,,Cost for June
2,,kWh,kWh Purchased,
3,,,,2024
4,,Purchased,,
5,,,,(PhP/kWh)
6,A. Power Supply Agreement (PSAs),,,
7,1. Green Core Geothermal Inc. (GCGI),15.67%,14880000,6.4415
8,2. Energy Development Corporation (EDC),6.07%,5760000,6.3000
9,3. Palm Concepcion Power Corp. (PCPC),27.43%,26040000,7.8998


In [29]:
# Works on sliced_cols_df, the column subset built in the previous cell.
# Row positions (not index labels) are used throughout, so the slice stays
# correct even if the frame's index is not the default 0..n-1 range.
first_col = sliced_cols_df.iloc[:, 0]

# Positions of the rows whose first column mentions "Supply" / "Market"
supply_positions = first_col.str.contains('Supply', case=False, na=False).to_numpy().nonzero()[0]
market_positions = first_col.str.contains('Market', case=False, na=False).to_numpy().nonzero()[0]

if len(supply_positions) == 0 or len(market_positions) == 0:
    raise ValueError('Could not find both a "Supply" row and a "Market" row in the first column')

# Start on the row after the first "Supply" row; end on the first "Market" row
start_index = int(supply_positions[0]) + 1
end_index = int(market_positions[0])

# Slice the DataFrame to include rows between start_index and end_index (inclusive)
sliced_rows_df = sliced_cols_df.iloc[start_index:end_index + 1]

# Exclude the row that contains the word "Metering" in the first column
sliced_rows_df = sliced_rows_df[~sliced_rows_df.iloc[:, 0].str.contains('Metering', case=False, na=False)]

# TODO: give the four columns descriptive names once they are settled

sliced_rows_df

Unnamed: 0.1,Unnamed: 0,Unnamed: 3,(A),(F = E/A )
7,1. Green Core Geothermal Inc. (GCGI),15.67%,14880000,6.4415
8,2. Energy Development Corporation (EDC),6.07%,5760000,6.3
9,3. Palm Concepcion Power Corp. (PCPC),27.43%,26040000,7.8998
10,4. CENPRI/ENERGREEN (Peaking & Reserve),0.91%,860500,
12,Subtotal:,50.85%,48269214,7.653
13,B. Wholesale Electricity Spot Market,49.15%,46663490,10.4393


**Notes**

1. TODO: Still need to extract the Generation Rate (GR) Net value.