In [1]:
import os
import re
import tabula
import pandas as pd
import numpy as np

In [2]:
# Set the JAVA_HOME environment variable to the Java installation directory
os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk/libexec/openjdk.jdk"

In [99]:
# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [4]:
%pip install -q tabula-py
%pip install requests beautifulsoup4
%pip install openpyxl
%pip install selenium
%pip install webdriver-manager

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### **Importing**

In [145]:
import requests

url = "https://www.leyecoiv.com/rates/breakdown-of-generation-charges/"

# Send a GET request to the webpage
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Get the content of the webpage
    html_content = response.text

    # Save the content to a file
    with open("leyeco_iv_webpage.html", "w", encoding="utf-8") as file:
        file.write(html_content)
    print("Webpage source code downloaded and saved as leyeco_iv_webpage.html")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

Webpage source code downloaded and saved as leyeco_iv_webpage.html


In [154]:
import re

# Read the HTML content from the file
with open('leyeco_iv_webpage.html', 'r') as file:
    html_content = file.read()

# Find all href links that match the specified format
pattern = r'href=["\'](https://www\.leyecoiv\.com/\d{4}/\d{2}/\d{2}/[^"\']*?\d{4}/)["\']'
links = re.findall(pattern, html_content)

# Display the extracted links
unique_links = set(links)

# Iterate over the unique links and print each one
for link in unique_links:
    print(link)

https://www.leyecoiv.com/2022/12/20/december-2022/
https://www.leyecoiv.com/2022/10/18/october-2022/
https://www.leyecoiv.com/2020/01/10/january-2020/
https://www.leyecoiv.com/2023/02/15/february-2023/
https://www.leyecoiv.com/2021/03/11/march-2021/
https://www.leyecoiv.com/2018/03/07/march-2018/
https://www.leyecoiv.com/2022/11/20/november-2022/
https://www.leyecoiv.com/2020/04/10/april-2020/
https://www.leyecoiv.com/2018/10/09/october-2018/
https://www.leyecoiv.com/2020/09/10/september-2020/
https://www.leyecoiv.com/2019/05/09/may-2019/
https://www.leyecoiv.com/2020/03/10/march-2020/
https://www.leyecoiv.com/2018/10/09/november-2018/
https://www.leyecoiv.com/2021/08/11/august-2021/
https://www.leyecoiv.com/2018/01/07/january-2018/
https://www.leyecoiv.com/2018/05/07/may-2018/
https://www.leyecoiv.com/2022/09/01/july-2022/
https://www.leyecoiv.com/2019/09/09/september-2019/
https://www.leyecoiv.com/2022/09/01/august-2022/
https://www.leyecoiv.com/2021/02/11/february-2021/
https://www.

In [164]:
import warnings
from bs4 import BeautifulSoup
import re
import requests

# Ignore FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Function to fetch and store a table from a given URL
def fetch_table(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table')
        
        if table:
            df = pd.read_html(str(table))[0]
            return df
        else:
            print(f"No table found at {url}")
            return None
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

# Function to extract date key (MM-YY) from the URL
def extract_date_key(url):
    # First, look for a pattern with a month name and year at the end of the URL
    match = re.search(r'/([a-zA-Z]+)-(\d{4})/', url)
    if match:
        month_name = match.group(1).lower()  # Extract month name
        year = match.group(2)[-2:]  # Get last two digits of the year

        # Convert the month name to its corresponding number (01 for January, etc.)
        month_map = {
            "january": "01", "february": "02", "march": "03",
            "april": "04", "may": "05", "june": "06",
            "july": "07", "august": "08", "september": "09",
            "october": "10", "november": "11", "december": "12"
        }

        month = month_map.get(month_name)
        if month:
            return f"{month}-{year}"
    
    # Fallback to the original pattern if no month name is found
    match = re.search(r'/(\d{4})/(\d{2})/\d{2}/', url)
    if match:
        year = match.group(1)[-2:]  # Get last two digits of the year
        month = match.group(2)
        return f"{month}-{year}"
    
    return None

# Iterate over unique links and store tables in DataFrames with date keys
dataframes = {}

for link in unique_links:
    date_key = extract_date_key(link)
    if date_key:
        df = fetch_table(link)
        if df is not None:
            dataframes[date_key] = df

### **Processing**

In [171]:
# Function to find the column name where a keyword is present in any of its rows
def find_column_by_keyword(df, keyword):
    mask = df.apply(lambda col: col.str.contains(keyword, case=False, na=False)).any()
    return df.columns[mask]

# Function to clean the "Power Supplier" column
def clean_power_supplier(value):
    # Remove leading numbers and periods
    value = re.sub(r'^\d+\.\s*', '', value)
    # Replace "IEMOP" with "WESM"
    value = value.replace("IEMOP", "WESM")
    return value

# Function to convert kWh values with parentheses to negative numbers
def convert_parentheses_to_negative(value):
    if isinstance(value, str) and re.match(r'^\(.*\)$', value):
        value = '-' + value.strip('()')
    return value

# Initialize an empty DataFrame to store the results
big_df = pd.DataFrame()

# Loop through the DataFrames in the dictionary
for date_key, df in dataframes.items():
    # Identify the columns that contain the specified keywords in any of their rows
    columns_to_include = [
        find_column_by_keyword(df, 'CONTRACTS')[0],
        find_column_by_keyword(df, '% to')[0],
        find_column_by_keyword(df, 'Purchased')[1],  # Get the second column with "Purchased"
        find_column_by_keyword(df, 'Average')[0],
    ]
    
    # Create a new DataFrame with the selected columns
    sliced_cols_df = df[columns_to_include]
    
    # Find the index of the row containing the word "CONTRACTS" in the first column
    start_index = sliced_cols_df[sliced_cols_df.iloc[:, 0].str.contains('CONTRACTS', case=False, na=False)].index[0] + 1
    
    # Find the index of the row containing the word "IEMOP" or "WESM" in the first column
    end_index = sliced_cols_df[
        sliced_cols_df.iloc[:, 0].str.contains(r'WESM|IEMOP|INDEPENDENT ELECTRICITY MARKET OPERATOR', case=False, na=False)
    ].index[0]
    
    # Slice the DataFrame to include rows between start_index and end_index (inclusive)
    sliced_rows_df = sliced_cols_df.iloc[start_index:end_index + 1]
    
    # Exclude the row that contains the word "Metering" in the first column
    sliced_rows_df = sliced_rows_df[~sliced_rows_df.iloc[:, 0].str.contains('Metering', case=False, na=False)]
    
    # Rename the columns
    sliced_rows_df.columns = ["Power Supplier", "%", "kWh", "Average Generation Cost"]

    # Drop rows where the first column (Power Supplier) is null
    sliced_rows_df = sliced_rows_df.dropna(subset=[sliced_rows_df.columns[0]])

    # Apply the cleaning function to the first column of sliced_df
    sliced_rows_df.iloc[:, 0] = sliced_rows_df.iloc[:, 0].apply(clean_power_supplier)
    
    # Convert kWh values with parentheses to negative numbers
    sliced_rows_df['kWh'] = sliced_rows_df['kWh'].apply(convert_parentheses_to_negative)
    sliced_rows_df['Average Generation Cost'] = sliced_rows_df['Average Generation Cost'].apply(convert_parentheses_to_negative)
    
    # Reset the index
    sliced_rows_df = sliced_rows_df.reset_index(drop=True)
    
    # Find the column that contains the word "Average"
    average_column = sliced_cols_df.columns[sliced_cols_df.apply(lambda col: col.str.contains('Average', case=False, na=False)).any()][0]
    
    # Find the row where the first column contains the word "TOTAL"
    total_rows = sliced_cols_df.loc[sliced_cols_df.iloc[:, 0] == "TOTAL", average_column]

    if not total_rows.empty:
        # If "TOTAL" is found, get the last occurrence
        total_row_value = total_rows.iloc[-1]
    else:
        # If "TOTAL" is not found, search for the row containing "GENERATION CHARGE"
        total_row_value = sliced_cols_df.loc[
            sliced_cols_df.iloc[:, 0].str.contains("GENERATION CHARGE", case=False, na=False),
            average_column
        ].iloc[-1]

    # If Generation Charge is still null, try to get the value from the column before the "Average" column
    if pd.isna(total_row_value):
        prev_column = df.columns[df.columns.get_loc(average_column) - 1]
        total_row_value = df.iloc[-1][prev_column]

    # Add a new column "Generation Charge" with this value
    sliced_rows_df['Generation Charge'] = total_row_value
    
    # Add a new column "Date" with the date_key value (formatted as 'Month-Year')
    month, year = date_key.split('-')
    sliced_rows_df['Date'] = pd.to_datetime(f'20{year}-{month}-01').strftime('%b-%Y')
    
    # Append the result to big_df
    big_df = pd.concat([big_df, sliced_rows_df], ignore_index=True)

# Display the final combined DataFrame
big_df

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost,Generation Charge,Date
0,GNPD,115%,8877900.0,8.2662,8.8636,Dec-2022
1,WESM,-15%,-1171489.0,3.5367,8.8636,Dec-2022
2,GNPD,125%,10655000.0,11.3173,11.8258,Oct-2022
3,WESM,-25%,-2153608.0,6.6169,11.8258,Oct-2022
4,GMCP,33%,2341709.0,4.8026,3.7636,Jan-2020
5,WESM,15%,1067489.0,0.9605,3.7636,Jan-2020
6,GNPD,110%,7877743.0,8.1438,8.375,Feb-2023
7,WESM,-10%,-743660.0,0.4309,8.375,Feb-2023
8,GNPD,108%,7916000.0,4.8451,5.0442,Mar-2021
9,WESM,-8%,-609227.0,0.7673,5.0442,Mar-2021


#### Creating Supplier Dataframe

In [172]:
unique_suppliers = big_df['Power Supplier'].unique()
unique_suppliers

array(['GNPD', 'WESM', 'GMCP', 'SMEC', 'GCGI',
       'GNPower Dinginin Ltd. Co.',
       'INDEPENDENT ELECTRICITY MARKET OPERATOR'], dtype=object)

In [173]:
big_df.replace("INDEPENDENT ELECTRICITY MARKET OPERATOR", "WESM", inplace=True)
big_df.replace("GNPower Dinginin Ltd. Co.", "GNPD", inplace=True)
unique_suppliers = big_df['Power Supplier'].unique()
unique_suppliers

array(['GNPD', 'WESM', 'GMCP', 'SMEC', 'GCGI'], dtype=object)

In [174]:
# Create a mapping of power suppliers to unique IDs
supplier_id_map = {supplier: id+1 for id, supplier in enumerate(unique_suppliers)}

# Create a new DataFrame from the mapping
supplier_df = pd.DataFrame(list(supplier_id_map.items()), columns=['Power Suppliers', 'Power Supplier ID'])

supplier_df.head()

Unnamed: 0,Power Suppliers,Power Supplier ID
0,GNPD,1
1,WESM,2
2,GMCP,3
3,SMEC,4
4,GCGI,5


In [175]:
# Create a mapping from Power Suppliers to Supplier IDs
supplier_mapping = dict(zip(supplier_df['Power Suppliers'], supplier_df['Power Supplier ID']))

# Replace names with IDs in big_df
big_df['Power Supplier ID'] = big_df['Power Supplier'].map(supplier_mapping)

# Drop the old Power Suppliers column
big_df = big_df.drop(columns=['Power Supplier'])

big_df.head()

Unnamed: 0,%,kWh,Average Generation Cost,Generation Charge,Date,Power Supplier ID
0,115%,8877900,8.2662,8.8636,Dec-2022,1
1,-15%,-1171489,3.5367,8.8636,Dec-2022,2
2,125%,10655000,11.3173,11.8258,Oct-2022,1
3,-25%,-2153608,6.6169,11.8258,Oct-2022,2
4,33%,2341709,4.8026,3.7636,Jan-2020,3


In [176]:
%pip install openpyxl

with pd.ExcelWriter("Historical_LEYECO_IV_GC_Breakdown.xlsx", engine='openpyxl') as writer:
    big_df.to_excel(writer, sheet_name='Historical GC', index=False)
    supplier_df.to_excel(writer, sheet_name='Supplier IDs', index=False)

Note: you may need to restart the kernel to use updated packages.


#### for troubleshooting loop

In [165]:
df = dataframes["01-22"]
df

Unnamed: 0,0,1,2,3,4,5,6
0,,,(A),(B),(C),(D = B + C),[D / A]
1,SOURCE,% to Total kWh Purchased,kWh Purchased,Basic Generation Cost (Php),"Other Cost Adjustments (DAA, NSS, & Other Bill...",Total Generation Cost for the Month (Php),Average Generation Cost (Php/kWh)
2,NPC-TSC,,,,,,
3,BILATERAL CONTRACTS w/ IPPS,,,,,,
4,1. GNPD,104%,5555404,35822718.87,,35822718.87,6.4483
5,IEMOP,-4%,"(202,725)","(93,910.94)","(1,162,958.08)","(1,256,869.02)",6.1999
6,SELF-GENERATION,,,,,,
7,SALE FOR RESALE,,,,,,
8,OTHERS,,,,,,
9,1. Prompt Payment Discount (Net),,,,"(771,020.30)","(771,020.30)",


In [166]:
# Function to find the column name where a keyword is present in any of its rows
def find_column_by_keyword(df, keyword):
    mask = df.apply(lambda col: col.str.contains(keyword, case=False, na=False)).any()
    return df.columns[mask]

# Identifying the columns that contain the specified keywords in any of their rows
columns_to_include = [
    find_column_by_keyword(df, 'CONTRACTS')[0],
    find_column_by_keyword(df, '% to')[0],
    find_column_by_keyword(df, 'Purchased')[1],  # Get the second column with "Purchased"
    find_column_by_keyword(df, 'Average')[0],
]

# Creating a new DataFrame with the selected columns
sliced_cols_df = df[columns_to_include]

sliced_cols_df

Unnamed: 0,0,1,2,6
0,,,(A),[D / A]
1,SOURCE,% to Total kWh Purchased,kWh Purchased,Average Generation Cost (Php/kWh)
2,NPC-TSC,,,
3,BILATERAL CONTRACTS w/ IPPS,,,
4,1. GNPD,104%,5555404,6.4483
5,IEMOP,-4%,"(202,725)",6.1999
6,SELF-GENERATION,,,
7,SALE FOR RESALE,,,
8,OTHERS,,,
9,1. Prompt Payment Discount (Net),,,


In [167]:
# Find the index of the row containing the word "Supply" in the first column
start_index = sliced_cols_df[sliced_cols_df.iloc[:, 0].str.contains('CONTRACTS', case=False, na=False)].index[0] + 1

# Find the index of the row containing the word "Market" in the first column
end_index = sliced_cols_df[
    sliced_cols_df.iloc[:, 0].str.contains(r'WESM|IEMOP|INDEPENDENT ELECTRICITY MARKET OPERATOR', case=False, na=False)
].index[0]

# Slice the DataFrame to include rows between start_index and end_index (inclusive)
sliced_rows_df = sliced_cols_df.iloc[start_index:end_index + 1]

# Exclude the row that contains the word "Metering" in the first column
sliced_rows_df = sliced_rows_df[~sliced_rows_df.iloc[:, 0].str.contains('Metering', case=False, na=False)]

sliced_rows_df.columns = ["Power Supplier", "%", "kWh", "Average Generation Cost"]

sliced_rows_df

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost
4,1. GNPD,104%,5555404,6.4483
5,IEMOP,-4%,"(202,725)",6.1999


In [168]:
import re

# Function to clean the "Power Supplier" column
def clean_power_supplier(value):
    # Remove leading numbers and periods
    value = re.sub(r'^\d+\.\s*', '', value)
    return value

# Apply the cleaning function to the first column of sliced_df
sliced_rows_df.iloc[:, 0] = sliced_rows_df.iloc[:, 0].apply(clean_power_supplier)

sliced_rows_df = sliced_rows_df.reset_index(drop=True)
sliced_rows_df

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost
0,GNPD,104%,5555404,6.4483
1,IEMOP,-4%,"(202,725)",6.1999


In [169]:
# Find the column that contains the word "Average"
average_column = sliced_cols_df.columns[sliced_cols_df.apply(lambda col: col.str.contains('Average', case=False, na=False)).any()][0]

# Find the row where the first column contains the word "TOTAL"
# Try to find the last instance of the row containing "TOTAL" in the first column
total_rows = sliced_cols_df.loc[sliced_cols_df.iloc[:, 0] == "TOTAL", average_column]

if not total_rows.empty:
    # If "TOTAL" is found, get the last occurrence
    total_row_value = total_rows.iloc[-1]
else:
    # If "TOTAL" is not found, search for the row containing "GENERATION CHARGE"
    total_row_value = sliced_cols_df.loc[
        sliced_cols_df.iloc[:, 0].str.contains("GENERATION CHARGE", case=False, na=False),
        average_column
    ].iloc[-1]

# Add a new column "Generation Charge" with this value
sliced_rows_df['Generation Charge'] = total_row_value

# Display the updated DataFrame
sliced_rows_df

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost,Generation Charge
0,GNPD,104%,5555404,6.4483,6.1922
1,IEMOP,-4%,"(202,725)",6.1999,6.1922
