In [1]:
import os
import re
import tabula
import pandas as pd
import numpy as np

In [2]:
# Make sure JAVA_HOME is defined for this kernel session. A value already present in
# the environment is kept; the Homebrew OpenJDK location is only a fallback, so the
# notebook does not clobber a working Java setup on machines laid out differently.
os.environ.setdefault("JAVA_HOME", "/opt/homebrew/opt/openjdk/libexec/openjdk.jdk")

In [3]:
# Lift pandas' display truncation so frames are rendered with every row and column
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

In [4]:
# Install the third-party packages this notebook relies on. %pip installs into the
# environment of the running kernel.
# NOTE(review): the first cell already imports tabula, so on a fresh environment this
# cell has to run (and the kernel may need a restart) before that import can succeed.
%pip install -q tabula-py
%pip install requests beautifulsoup4
%pip install openpyxl
%pip install selenium
%pip install webdriver-manager

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### **Importing**

Download the HTML source of CASURECO II's rates page and save a local copy.

In [5]:
import requests

# Fetch the first page of the rates listing and keep a local HTML copy.
url = 'https://www.casureco2.com.ph/support/rates?page=1'

try:
    # A timeout keeps the cell from hanging indefinitely when the site is unresponsive.
    response = requests.get(url, timeout=30)
except requests.RequestException as exc:
    # Connection-level problems (DNS, refused connection, timeout) never produce a
    # status code, so report them separately instead of surfacing a raw traceback.
    response = None
    print(f"Failed to retrieve the webpage. Error: {exc}")
else:
    if response.status_code == 200:
        with open('generation_charge.html', 'w', encoding='utf-8') as file:
            file.write(response.text)
        print("Webpage downloaded and saved as 'generation_charge.html'.")
    else:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

Webpage downloaded and saved as 'generation_charge.html'.


In [11]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

# Set up Selenium WebDriver
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
# webdriver-manager supplies the chromedriver location handed to Selenium's Service
# (presumably downloading a matching driver when needed — confirm against its docs).
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)
# Open the first page of the rates listing; the functions below operate on this shared `driver`.
driver.get('https://www.casureco2.com.ph/support/rates?page=1')

# Create the directory for storing PDFs; exist_ok makes this a no-op when it is
# already there and avoids the check-then-create race of a separate exists() test.
download_dir = 'pdf_downloads'
os.makedirs(download_dir, exist_ok=True)

# Function to download PDF
def download_pdf(pdf_url, save_path, timeout=30):
    """Download ``pdf_url`` and write the response body to ``save_path``.

    Args:
        pdf_url: URL of the file to fetch.
        save_path: Destination path; opened in binary write mode.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. On success the file is written and a "Downloaded: ..." line is printed.

    Nothing is written when the request fails or the server answers with an HTTP
    error status; a message is printed instead, so that an error page is never
    saved under a ``.pdf`` name and one bad link does not stop the caller.
    """
    try:
        response = requests.get(pdf_url, timeout=timeout)
        # Turn 4xx/5xx answers into exceptions so they are handled like network errors.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Failed to download {pdf_url}: {exc}")
        return

    with open(save_path, 'wb') as file:
        file.write(response.content)
    print(f"Downloaded: {save_path}")

# Function to process the current page
def process_current_page():
    """Download the "Generation Charge" PDF for every row of the rates table on the current page.

    Waits up to 10 seconds for the element with id ``biddings_table`` to be present,
    parses a snapshot of the page source with BeautifulSoup to discover the rows, and
    then drives the live page through the shared ``driver`` to open each row's
    dropdown and read the link target. Files are saved into ``download_dir`` as
    ``<month>_<year>_Generation_Charge.pdf`` via ``download_pdf``.

    Rows without a dropdown button (or without an id on it), without the two
    month/year header cells, or without a "Generation Charge" link are skipped
    instead of aborting the whole run.
    """
    # Wait for the table to load
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'biddings_table')))

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find('table', {'id': 'biddings_table'})
    table_body = table.find('tbody') if table is not None else None
    if table_body is None:
        print("Rates table has no body on this page; nothing to download.")
        return

    for row in table_body.find_all('tr'):
        # Find the "View PDF" button; rows without one (or without an id to
        # locate it in the live page) carry nothing to download.
        view_pdf_button = row.find('button', {'data-dropdown-toggle': True})
        view_pdf_button_id = view_pdf_button.get('id') if view_pdf_button else None
        if not view_pdf_button_id:
            continue

        # The first two header cells of the row hold the month and the year.
        header_cells = row.find_all('th')
        if len(header_cells) < 2:
            print(f"Skipping row for button '{view_pdf_button_id}': month/year cells not found.")
            continue
        month = header_cells[0].text.strip()
        year = header_cells[1].text.strip()

        driver.find_element(By.ID, view_pdf_button_id).click()

        # Wait for the dropdown to be visible
        dropdown_id = f'dropdown{view_pdf_button_id.split("dropdownDefaultButton")[-1]}'
        WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.ID, dropdown_id)))

        # find_elements returns an empty list when the link is absent, whereas
        # find_element would raise and stop the scrape at the first such row.
        dropdown = driver.find_element(By.ID, dropdown_id)
        matching_links = dropdown.find_elements(By.LINK_TEXT, 'Generation Charge')
        gen_charge_link = matching_links[0].get_attribute('href') if matching_links else None

        if gen_charge_link:
            save_path = os.path.join(download_dir, f'{month}_{year}_Generation_Charge.pdf')
            download_pdf(gen_charge_link, save_path)
        else:
            print(f"No 'Generation Charge' link found for {month} {year}; skipping.")

        # Close the dropdown by clicking again on "View PDF"
        driver.find_element(By.ID, view_pdf_button_id).click()

# Function to check if we are on the last page
def is_last_page():
    """Report whether the pagination summary says the final result is on screen.

    Reads the summary paragraph (text of the form "Showing X to Y of Z results")
    through the shared ``driver`` and compares Y, the last result shown, with Z,
    the total. Any failure while locating or parsing the text is printed and
    treated as "not the last page".

    Returns:
        bool: True when Y equals Z, otherwise False (including on errors).
    """
    summary_xpath = '//p[@class="text-sm text-gray-700 dark:text-white leading-5"]'
    try:
        tokens = driver.find_element(By.XPATH, summary_xpath).text.split()
        # Token 3 is Y and token 5 is Z in "Showing X to Y of Z results".
        last_shown, total_results = (int(tokens[position]) for position in (3, 5))
    except Exception as e:
        print(f"Error checking last page: {e}")
        return False
    return last_shown == total_results

# Loop through pages. The surrounding try/finally guarantees the headless browser
# is shut down even when a page fails part-way through (e.g. a wait times out);
# otherwise the Chrome process would be left running.
try:
    while True:
        process_current_page()

        # Check if there is a next page
        if is_last_page():
            break

        try:
            next_button = driver.find_element(By.XPATH, '/html/body/div/div[1]/main/section/div[2]/div/div[2]/nav/div[2]/div[2]/span/a[2]')
            next_button.click()
            time.sleep(2)  # Wait for the next page to load
        except Exception as e:
            # No usable "next" control: report it and stop paging.
            print(f"An error occurred: {e}")
            break
finally:
    # Close the driver
    driver.quit()

Downloaded: pdf_downloads/July_2024_Generation_Charge.pdf
Downloaded: pdf_downloads/June_2024_Generation_Charge.pdf
Downloaded: pdf_downloads/May_2024_Generation_Charge.pdf
Downloaded: pdf_downloads/April_2024_Generation_Charge.pdf
Downloaded: pdf_downloads/March_2024_Generation_Charge.pdf
Downloaded: pdf_downloads/February_2024_Generation_Charge.pdf
Downloaded: pdf_downloads/January_2024_Generation_Charge.pdf
Downloaded: pdf_downloads/December_2023_Generation_Charge.pdf
Downloaded: pdf_downloads/November_2023_Generation_Charge.pdf
Downloaded: pdf_downloads/October_2023_Generation_Charge.pdf
Downloaded: pdf_downloads/September_2023_Generation_Charge.pdf
Downloaded: pdf_downloads/August_2023_Generation_Charge.pdf
Downloaded: pdf_downloads/July_2023_Generation_Charge.pdf
Downloaded: pdf_downloads/June_2023_Generation_Charge.pdf
Downloaded: pdf_downloads/May_2023_Generation_Charge.pdf
Downloaded: pdf_downloads/April_2023_Generation_Charge.pdf
Downloaded: pdf_downloads/March_2023_Generatio