In [1]:
import os
import re
import tabula
import pandas as pd
import numpy as np

In [2]:
# Point JAVA_HOME at the OpenJDK bundle under /opt/homebrew for this kernel process.
# This overwrites any JAVA_HOME value the process inherited.
os.environ.update({"JAVA_HOME": "/opt/homebrew/opt/openjdk/libexec/openjdk.jdk"})

In [3]:
# Lift pandas' display limits so that every row and every column is rendered
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

In [4]:
# Install the third-party packages this notebook relies on into the kernel's environment.
# NOTE(review): this cell comes after the cell that runs `import tabula`, so on a fresh
# environment it has to be executed first (pip's own output suggests a kernel restart may be needed).
%pip install -q tabula-py
# requests and beautifulsoup4 are imported by the scraping cells below
%pip install requests beautifulsoup4
# openpyxl is the engine passed to pd.ExcelWriter in the export cell
%pip install openpyxl
# NOTE(review): selenium and webdriver-manager are not used by any cell visible here — confirm they are still needed
%pip install selenium
%pip install webdriver-manager

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### **Importing**

Download the HTML source of TARELCO II's rates page and save a local copy.

In [6]:
import requests

# URL of the rates page to scrape
url = "https://www.tarelco2.com/rates.html"

# Fetch the page. Without a timeout, requests waits indefinitely on a server that
# never answers, which would freeze the notebook; 30 seconds is generous for one page.
response = requests.get(url, timeout=30)

# Only a 200 response carries the page we want
if response.status_code == 200:
    # Decoded HTML source of the page
    html_content = response.text

    # Keep a local copy so the parsing cell can work from disk
    with open("tarelco2_rates.html", "w", encoding="utf-8") as file:
        file.write(html_content)

    print("HTML source code has been downloaded and saved as 'tarelco2_rates.html'.")
else:
    print(f"Failed to retrieve the website. Status code: {response.status_code}")

HTML source code has been downloaded and saved as 'tarelco2_rates.html'.


In [8]:
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin

# Root that relative links on the rates page are resolved against
site_root = "https://www.tarelco2.com/"

# Load the HTML file saved earlier
with open("tarelco2_rates.html", "r", encoding="utf-8") as file:
    soup = BeautifulSoup(file, "html.parser")

# Create the directory for the PDF files (no-op when it already exists)
pdf_dir = "pdf downloads"
os.makedirs(pdf_dir, exist_ok=True)

# Every <a> on the page is inspected; a link is treated as a PDF link
# when its visible text parses as a number (e.g. "5.9071").
for link in soup.find_all('a'):
    href = link.get('href')
    text = link.get_text(strip=True)

    # An anchor without an href has nothing to download
    if not href:
        continue

    # Guard ONLY the numeric check. Wrapping the download in the same try block would
    # silently swallow download errors that derive from ValueError (requests' InvalidURL
    # is one such exception).
    try:
        float(text)
    except ValueError:
        # Link text is not a number: not one of the links we want
        continue

    # urljoin handles relative, root-relative and absolute hrefs alike
    pdf_url = urljoin(site_root, href)
    pdf_response = requests.get(pdf_url, timeout=30)

    # Do not store an error page under a .pdf name; report it and move on
    if pdf_response.status_code != 200:
        print(f"Skipped {pdf_url}: status code {pdf_response.status_code}")
        continue

    # Save the PDF file under its original file name
    pdf_name = os.path.join(pdf_dir, os.path.basename(href))
    with open(pdf_name, "wb") as pdf_file:
        pdf_file.write(pdf_response.content)

    print(f"Downloaded: {pdf_name}")

Downloaded: pdf downloads/GenBreakdown2024-01.pdf
Downloaded: pdf downloads/GenBreakdown2024-02.pdf
Downloaded: pdf downloads/GenBreakdown2024-03.pdf
Downloaded: pdf downloads/GenBreakdown2024-04.pdf
Downloaded: pdf downloads/GenBreakdown2024-05.pdf
Downloaded: pdf downloads/GenBreakdown2024-06.pdf
Downloaded: pdf downloads/GenBreakdown2024-07.pdf
Downloaded: pdf downloads/GenBreakdown2023-01.pdf
Downloaded: pdf downloads/GenBreakdown2023-02.pdf
Downloaded: pdf downloads/GenBreakdown2023-03.pdf
Downloaded: pdf downloads/GenBreakdown2023-04.pdf
Downloaded: pdf downloads/GenBreakdown2023-05.pdf
Downloaded: pdf downloads/GenBreakdown2023-06.pdf
Downloaded: pdf downloads/GenBreakdown2023-07.pdf
Downloaded: pdf downloads/GenBreakdown2023-08.pdf
Downloaded: pdf downloads/GenBreakdown2023-09.pdf
Downloaded: pdf downloads/GenBreakdown2023-10.pdf
Downloaded: pdf downloads/GenBreakdown2023-11.pdf
Downloaded: pdf downloads/GenBreakdown2023-12.pdf
Downloaded: pdf downloads/GenBreakdown2022-01.pdf


### **Processing**

In [30]:
from datetime import datetime

# Start `big_df` as an empty DataFrame; the loop further down fills it with the rows of every PDF
big_df = pd.DataFrame()

def process_pdf(file_path):
    """Extract the per-supplier generation breakdown from one PDF.

    The first table tabula finds on page 1 (lattice mode) is reduced to the four
    columns whose header contains "SOURCE", "%", "Purchased" or "Average". Rows
    whose supplier label starts with a digit are kept, except those mentioning
    "metering"; the "N. " numbering prefix is stripped from the label and
    carriage returns inside it become spaces.

    Parameters
    ----------
    file_path : str
        Path of the PDF to read.

    Returns
    -------
    pandas.DataFrame
        Columns "Power Supplier", "%", "kWh", "Average Generation Cost" and
        "Generation Charge", with a fresh 0..n-1 index. "Generation Charge" holds,
        on every row, the "Average Generation Cost" of the first row whose label
        contains "TOTAL" (case-insensitive), or None when there is no such row.
        Cell values are left exactly as tabula parsed them.

    Raises
    ------
    IndexError
        If tabula returns no table for page 1.
    ValueError
        If the table does not have exactly four columns matching the keywords.
    """
    tables = tabula.read_pdf(file_path, lattice=True, pages=1)
    if not tables:
        raise IndexError(f"No table found on page 1 of {file_path!r}")
    df = tables[0]

    # Header substrings identifying the columns of interest, and the names they receive
    keywords = ["SOURCE", "%", "Purchased", "Average"]
    new_names = ["Power Supplier", "%", "kWh", "Average Generation Cost"]

    # str(col) keeps the membership test valid even for non-string column labels
    selected = [col for col in df.columns if any(keyword in str(col) for keyword in keywords)]
    if len(selected) != len(new_names):
        raise ValueError(
            f"Expected {len(new_names)} columns matching {keywords} in {file_path!r}, "
            f"found {len(selected)}: {selected}"
        )

    # Work on a copy so renaming never touches the frame tabula returned
    df_sliced_columns = df[selected].copy()
    df_sliced_columns.columns = new_names

    # Text view of the supplier column with every non-string cell (NaN, numbers) mapped to ''.
    # The .str accessor raises on a column that is entirely NaN / non-string; an empty
    # string simply matches none of the patterns below, exactly like na=False.
    labels = pd.Series(
        [value if isinstance(value, str) else '' for value in df_sliced_columns['Power Supplier']],
        index=df_sliced_columns.index,
        dtype=object,
    )

    # Supplier rows: label starts with a digit and does not mention "metering"
    starts_with_number = labels.str.contains(r'^\d+', na=False)
    mentions_metering = labels.str.contains('metering', case=False, na=False)
    df_new = df_sliced_columns[starts_with_number & ~mentions_metering].drop_duplicates().copy()

    # Remove the "N. " prefix and replace \r with a space in the supplier names
    df_new['Power Supplier'] = df_new['Power Supplier'].map(
        lambda x: re.sub(r'^\d+\.\s*', '', str(x)).replace('\r', ' ')
    )

    # The first TOTAL row supplies the overall generation charge
    total_row = df_sliced_columns[labels.str.contains('TOTAL', case=False, na=False)]
    generation_charge = total_row['Average Generation Cost'].values[0] if not total_row.empty else None

    # Same value on every supplier row
    df_new['Generation Charge'] = generation_charge

    return df_new.reset_index(drop=True)

# Process every monthly PDF in the folder and collect the resulting frames.
# sorted() makes the row order independent of the arbitrary order os.listdir returns;
# the supplier IDs assigned further down follow the order of first appearance in big_df.
monthly_frames = []
for filename in sorted(os.listdir('pdf downloads')):
    if not filename.endswith('.pdf'):
        continue

    # The month is encoded in the name: GenBreakdownYYYY-MM.pdf
    name_match = re.search(r'GenBreakdown(\d{4}-\d{1,2})\.pdf$', filename)
    if name_match is None:
        # A PDF that does not follow the naming scheme has no date to attach
        print(f"Skipped (unexpected file name): {filename}")
        continue

    file_path = os.path.join('pdf downloads', filename)

    # Process the PDF file
    df_processed = process_pdf(file_path)

    # Add the "Date" column, formatted like "Jan-2024"
    date_obj = datetime.strptime(name_match.group(1), '%Y-%m')
    df_processed['Date'] = date_obj.strftime('%b-%Y')

    monthly_frames.append(df_processed)

# Concatenate once at the end instead of re-copying the growing frame on every iteration
if monthly_frames:
    big_df = pd.concat(monthly_frames, ignore_index=True)
else:
    # No usable PDF: keep the expected columns so the cells below still run
    big_df = pd.DataFrame(
        columns=["Power Supplier", "%", "kWh", "Average Generation Cost", "Generation Charge", "Date"]
    )

# Remove rows where '%' is missing, then renumber so the index has no gaps
big_df = big_df.dropna(subset=['%']).reset_index(drop=True)

big_df.head()

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost,Generation Charge,Date
0,GenPower Mariveles_GMPC,40.94%,12840823,4.2311,4.4093,Aug-2020
1,SMC Cosolidated Power,47.77%,14984765,4.9023,4.4093,Aug-2020
2,WESM,11.28%,3538503,2.968,4.4093,Aug-2020
3,GenPower Mariveles_GMPC,37.62%,11513325,4.1677,4.3211,Sep-2020
4,SMC Cosolidated Power,50.68%,15511663,4.8226,4.3211,Sep-2020


#### Creating Supplier Dataframe

In [32]:
# Distinct supplier names found in big_df
supplier_column = big_df['Power Supplier']
unique_suppliers = supplier_column.unique()

unique_suppliers

array(['GenPower Mariveles_GMPC', 'SMC Cosolidated Power', 'WESM',
       'GenPower Mariveles_GMEC', 'GenPower Dinginin_GMPD',
       'GenPower Dinginin_GMPC', 'Limay Power Inc.',
       'GenPower Dinginin_GnPD'], dtype=object)

In [33]:
# Number the suppliers 1, 2, 3, ... in the order they appear in unique_suppliers
supplier_id_map = dict(zip(unique_suppliers, range(1, len(unique_suppliers) + 1)))

# Two-column lookup table built from the (name, ID) pairs
supplier_rows = [(supplier_name, supplier_id) for supplier_name, supplier_id in supplier_id_map.items()]
supplier_df = pd.DataFrame(supplier_rows, columns=['Power Supplier', 'Power Supplier ID'])

supplier_df

Unnamed: 0,Power Supplier,Power Supplier ID
0,GenPower Mariveles_GMPC,1
1,SMC Cosolidated Power,2
2,WESM,3
3,GenPower Mariveles_GMEC,4
4,GenPower Dinginin_GMPD,5
5,GenPower Dinginin_GMPC,6
6,Limay Power Inc.,7
7,GenPower Dinginin_GnPD,8


In [34]:
# Name -> ID lookup read back out of supplier_df
supplier_mapping = supplier_df.set_index('Power Supplier')['Power Supplier ID'].to_dict()

# Attach each row's supplier ID, then drop the name column it was derived from.
# A name missing from the mapping would become NaN (and turn the ID column into floats).
big_df = (
    big_df
    .assign(**{'Power Supplier ID': big_df['Power Supplier'].map(supplier_mapping)})
    .drop(columns=['Power Supplier'])
)

big_df.head()

Unnamed: 0,%,kWh,Average Generation Cost,Generation Charge,Date,Power Supplier ID
0,40.94%,12840823,4.2311,4.4093,Aug-2020,1
1,47.77%,14984765,4.9023,4.4093,Aug-2020,2
2,11.28%,3538503,2.968,4.4093,Aug-2020,3
3,37.62%,11513325,4.1677,4.3211,Sep-2020,1
4,50.68%,15511663,4.8226,4.3211,Sep-2020,2


In [35]:
# Write both tables into one workbook: the breakdown on one sheet, the ID lookup on another
sheets = {'Historical GC': big_df, 'Supplier IDs': supplier_df}
with pd.ExcelWriter("Historical_TARELCO_II_GC_Breakdown.xlsx", engine='openpyxl') as workbook:
    for sheet_name, frame in sheets.items():
        frame.to_excel(workbook, sheet_name=sheet_name, index=False)

#### Troubleshooting: the processing steps run one at a time on a single PDF

In [22]:
# Read the first table from page 1 of one sample PDF (lattice mode)
df = tabula.read_pdf(
    "pdf downloads/GenBreakdown2024-01.pdf",
    pages=1,
    lattice=True,
)[0]

In [23]:
# Display the raw table as tabula parsed it, before any column filtering
df

Unnamed: 0,SOURCE,%\rTotal kWh\rPurchased,(A)\rKWh Purchased,(B)\rBasic\rGeneration\rCost\r(Php),"©\rOther Cost\rAdjustment (DAA)\rNSS, and Other Billing\radjustment)/ and Discount\r(PhP)",(D = B+ C)\rTotal Generation\rCost for the\rMonth (Php),(D/A)\rAverage\rGeneration\rCost\r(Php/kWH)
0,NPC-TSC,,,,,,
1,,,,,,,
2,BILATERAL,,,,,,
3,CONTRACTS W/ IPPs,,,,,,
4,1. GenPower Mariveles_GMEC,29.84%,10080743.0,58850533.51,-653644.79,58196888.73,5.7731
5,1. GenPower Dinginin_GnPD,17.50%,5911000.0,37121493.82,-277884.0,36843609.83,6.2331
6,2. Limay Power Inc.,21.74%,7343777.0,53448028.21,-828245.29,52619782.93,7.1652
7,3. WESM,30.70%,10371263.0,51456407.3,,51456407.3,4.9614
8,SELF-GENERATION,,,,,,
9,SALE FOR RESALE,,,,,,


In [24]:
# Header substrings that identify the columns of interest
keywords = ["SOURCE", "%", "Purchased", "Average"]

# Walk the header once and keep every column whose name contains at least one keyword
matching_columns = []
for column_name in df.columns:
    if any(keyword in column_name for keyword in keywords):
        matching_columns.append(column_name)
df_sliced_columns = df[matching_columns]

# Replace the long PDF headers with short, fixed names
df_sliced_columns.columns = ["Power Supplier", "%", "kWh", "Average Generation Cost"]
df_sliced_columns

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost
0,NPC-TSC,,,
1,,,,
2,BILATERAL,,,
3,CONTRACTS W/ IPPs,,,
4,1. GenPower Mariveles_GMEC,29.84%,10080743.0,5.7731
5,1. GenPower Dinginin_GnPD,17.50%,5911000.0,6.2331
6,2. Limay Power Inc.,21.74%,7343777.0,7.1652
7,3. WESM,30.70%,10371263.0,4.9614
8,SELF-GENERATION,,,
9,SALE FOR RESALE,,,


In [26]:
# Rows whose label starts with a digit are the numbered supplier entries
starts_with_number = df_sliced_columns['Power Supplier'].str.contains(r'^\d+', na=False)
df_filtered_leading_numbers = df_sliced_columns[starts_with_number]

# Of those, discard any row that mentions "metering" (case-insensitive)
mentions_metering = df_filtered_leading_numbers['Power Supplier'].str.contains('metering', case=False, na=False)
df_filtered_leading_numbers = df_filtered_leading_numbers[~mentions_metering]

# De-duplicate and take an independent copy to modify
df_new = df_filtered_leading_numbers.drop_duplicates().copy()

# Strip the "N. " numbering prefix and turn carriage returns into spaces
df_new['Power Supplier'] = df_new['Power Supplier'].map(
    lambda label: re.sub(r'^\d+\.\s*', '', str(label)).replace('\r', ' ')
)

# Renumber the rows from zero
df_new = df_new.reset_index(drop=True)

df_new

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost
0,GenPower Mariveles_GMEC,29.84%,10080743,5.7731
1,GenPower Dinginin_GnPD,17.50%,5911000,6.2331
2,Limay Power Inc.,21.74%,7343777,7.1652
3,WESM,30.70%,10371263,4.9614


In [27]:
# Locate the row(s) whose label contains "TOTAL", matching case-insensitively
is_total = df_sliced_columns['Power Supplier'].str.contains('TOTAL', case=False, na=False)
total_row = df_sliced_columns[is_total]

# Take the first such row's average cost; None when the table has no TOTAL row
if total_row.empty:
    generation_charge = None
else:
    generation_charge = total_row['Average Generation Cost'].values[0]

# Same value on every supplier row
df_new['Generation Charge'] = generation_charge

# Renumber the rows from zero
df_new = df_new.reset_index(drop=True)

df_new

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost,Generation Charge
0,GenPower Mariveles_GMEC,29.84%,10080743,5.7731,5.9071
1,GenPower Dinginin_GnPD,17.50%,5911000,6.2331,5.9071
2,Limay Power Inc.,21.74%,7343777,7.1652,5.9071
3,WESM,30.70%,10371263,4.9614,5.9071
