In [1]:
import os
import re
import tabula
import pandas as pd
import numpy as np

In [2]:
# Set the JAVA_HOME environment variable to the Java installation directory
os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk/libexec/openjdk.jdk"

In [3]:
# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [4]:
%pip install -q tabula-py
%pip install selenium requests 

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### **Importing**

The following lines of code download the files for the breakdown of generation charge from MERALCO's Rate Archives.

In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
import time

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # Run headless Chrome (no GUI)
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--disable-gpu')
options.add_argument('--window-size=1920x1080')

driver = webdriver.Chrome(options=options)

try:
    driver.get("https://company.meralco.com.ph/news-and-advisories/rates-archives")

    # Click "Show more" until all items are loaded
    while True:
        try:
            show_more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//a[@class='btn btn-bordered load-more' and @title='Go to next page']"))
            )
            show_more_button.click()
            time.sleep(2)  # Wait for content to load
        except Exception as e:
            break  # Break the loop if no more "Show more" button is found

    rows = driver.find_elements(By.XPATH, "//tr[.//span[contains(text(), 'Generation')]]")

    # Prepare the download directory
    os.makedirs('pdf downloads', exist_ok=True)

    for row in rows:
        try:
            # Find the date in the same row
            date_element = row.find_element(By.XPATH, ".//td[contains(@class, 'views-field-field-date-created')]")
            date_text = date_element.text.strip()

            # Convert date text to "mm-yy" format
            match = re.search(r'(\w+)\s+(\d{4})', date_text)
            if match:
                month_str, year_str = match.groups()
                month = {
                    'January': '01', 'February': '02', 'March': '03', 'April': '04',
                    'May': '05', 'June': '06', 'July': '07', 'August': '08',
                    'September': '09', 'October': '10', 'November': '11', 'December': '12'
                }[month_str]
                date_formatted = f"{month}-{year_str[-2:]}"

                # Break the loop if the date is December 2017
                if month_str == 'December' and year_str == '2017':
                    print("Reached December 2017. Stopping downloads.")
                    break
            else:
                print(f"Date format not recognized for text: {date_text}")
                continue  # Skip this row if date format is not recognized

            # Find the PDF link
            pdf_link = row.find_element(By.XPATH, ".//a[contains(@class, 'btn-bordered-orange') and contains(@href, '.pdf')]")
            url = pdf_link.get_attribute('href')

            # Download the PDF
            response = requests.get(url)
            if response.status_code == 200:
                pdf_name = f"gc_table_{date_formatted}.pdf"
                with open(f'pdf downloads/{pdf_name}', 'wb') as f:
                    f.write(response.content)
            else:
                print(f"Failed to download PDF from {url}, status code: {response.status_code}")

        except Exception as e:
            print(f"Error processing row: {e}")

finally:
    driver.quit()

Reached December 2017. Stopping downloads.


In [6]:
#Create a new dataframe for each pdf file.

downloads_dir = "pdf downloads"

dataframes = {}

pdf_files = [f for f in os.listdir(downloads_dir) if f.endswith('.pdf')]

for pdf_file in pdf_files:
    date_part = pdf_file.split('_')[-1].replace('.pdf', '')
    pdf_path = os.path.join(downloads_dir, pdf_file)

    df = tabula.read_pdf(pdf_path, stream=True, pages=1)[0]

    dataframes[f"df_{date_part}"] = df

### **Processing**

In [136]:
def process_rows(df):
    # Assuming df is already defined and contains the relevant data

    # Step 1: Filter the DataFrame
    supplier_index = df[df.iloc[:, 0].str.contains("SUPPLIERS", na=False)].index[0]
    df_new = df.iloc[supplier_index + 1:]

    df_new = df_new.reset_index(drop=True)

    charge_index = df_new[df_new.iloc[:, 0].str.contains("CHARGE", na=False, case=False, regex=False)].index
    if not charge_index.empty:
        charge_index = charge_index[0]
        df_new = df_new.iloc[:charge_index + 1]

    # Reset index for the cleaned DataFrame
    df_new = df_new.reset_index(drop=True)

    # Identify columns based on their content
    col_with_comma = None
    col_with_dot = None
    col_with_percent = None

    # Find the first column with a comma
    for i in range(1, len(df_new.columns)):
        if df_new.iloc[:, i].dtype == object:
            if col_with_comma is None and df_new.iloc[:, i].str.contains(',', na=False).any():
                col_with_comma = i

    # Find the last column with a dot
    for i in range(len(df_new.columns) - 1, -1, -1):
        if df_new.iloc[:, i].dtype == object:
            if df_new.iloc[:, i].str.contains('\.', na=False).any():
                col_with_dot = i
                break

    # Find the last column with a percent sign
    for i in range(len(df_new.columns) - 1, -1, -1):
        if df_new.iloc[:, i].dtype == object:
            if df_new.iloc[:, i].str.contains('%', na=False).any():
                col_with_percent = i
                break

    columns_to_include = [0]  # Always include the first column

    if col_with_comma is not None:
        df_new['kWh'] = df_new.iloc[:, col_with_comma]
        columns_to_include.append(col_with_comma)

    if col_with_dot is not None and col_with_dot not in columns_to_include:
        columns_to_include.append(col_with_dot)

    if col_with_percent is not None:
        df_new['%'] = df_new.iloc[:, col_with_percent]  
        columns_to_include.append(col_with_percent)

    if len(columns_to_include) == 1:
        # If only one column to include, default to the first two columns
        columns_to_include = [0, 1]

    if col_with_comma == col_with_percent:
        columns_to_include = [
            df_new.columns[0],  # First column
            'kWh',  # kWh column
            df_new.columns[col_with_dot],  # Column with dot
            df_new.columns[col_with_percent]  # Column with percent
        ]

        # Adjust the regular expression to handle spaces within the kWh value
        df_new['%'] = df_new.iloc[:, col_with_percent].str.extract(r'(\d+\.\d+%)')

        # Remove spaces within the kWh values
        df_new['kWh'] = df_new.iloc[:, col_with_percent].str.extract(r'\d+\.\d+%\s*(\d\s*[\d,]+)')[0].str.replace(' ', '')

    # Convert values in parentheses to negative numbers
    def convert_parentheses_to_negative(value):
        if isinstance(value, str) and value.startswith('(') and value.endswith(')'):
            return '-' + value[1:-1]
        return value

    df_new = df_new.applymap(convert_parentheses_to_negative)

    df_new = pd.DataFrame({
        'Power Suppliers': df_new.iloc[:, 0],
        'kWh': df_new['kWh'],
        'Average Generation Cost': df_new.iloc[:, col_with_dot],
        '%': df_new['%']
    })
    df_new.columns = ['Power Suppliers', 'kWh', 'Average Generation Cost', '%']

    # Remove numbers in Power Suppliers
    df_new['Power Suppliers'] = df_new['Power Suppliers'].str.replace(r'^\d+\.\s*', '', regex=True)

    # Get the 'Generation Charge' value
    charge_row = df[df.iloc[:, 0].str.contains('CHARGE', case=False, na=False)]

    if not charge_row.empty:
        # Get the value from the last column
        rate_value = charge_row.iloc[:, -1].values[0]

        # Check if the value is NaN
        if pd.isna(rate_value):
            # If NaN, get the value from the second last column
            rate_value = charge_row.iloc[:, -2].values[0]
    else:
        rate_value = None
    
    return df_new

In [137]:
def add_date(df,key):
    df = process_rows(dataframes[key])

    end_part = key.split("_")[1]

    month_map = {
        '01': 'Jan', '02': 'Feb', '03': 'Mar', '04': 'Apr',
        '05': 'May', '06': 'Jun', '07': 'Jul', '08': 'Aug',
        '09': 'Sep', '10': 'Oct', '11': 'Nov', '12': 'Dec'
    }
    month, year = end_part.split('-')
    formatted_date = f"{month_map[month]}-{year}"

    # Add the new column to the dataframe
    df['Date'] = formatted_date

    return(df)

In [138]:
# Initialize an empty list to store the processed dataframes
processed_dfs = []

for key in dataframes:
    print(key)
    df = dataframes[key]
    
    # Process rows and check if the result is not None
    processed_rows_df = process_rows(df)
    if processed_rows_df is None:
        raise ValueError(f"process_rows returned None for key: {key}")
    
    # Add date and check if the result is not None
    processed_df = add_date(processed_rows_df, key)
    if processed_df is None:
        raise ValueError(f"add_date returned None for key: {key}")
    
    processed_dfs.append(processed_df)

# Concatenate all the processed dataframes into one big dataframe
big_df = pd.concat(processed_dfs, ignore_index=True)

df_04-24
df_04-18
df_08-18
df_06-21
df_08-24
df_06-20
df_08-19
df_04-19
df_06-22
df_06-23
df_04-22
df_08-22
df_08-23
df_04-23
df_04-21
df_08-21
df_06-18
df_06-24
df_06-19
df_08-20
df_04-20
df_12-23
df_12-22
df_10-19
df_02-19
df_12-20
df_12-21
df_02-18
df_02-24
df_10-18
df_10-20
df_02-20
df_12-19
df_12-18
df_02-21
df_10-21
df_10-23
df_02-23
df_02-22
df_10-22
df_07-23
df_07-22
df_09-19
df_07-20
df_05-19
df_05-24
df_05-18
df_07-21
df_09-18
df_09-20
df_07-19
df_05-20
df_05-21
df_07-18
df_09-21
df_07-24
df_09-23
df_05-23
df_05-22
df_09-22
df_03-18


  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_paren

df_03-24
df_11-18
df_01-21
df_01-20
df_11-19
df_03-19
df_01-22
df_01-23
df_03-22
df_11-22
df_11-23
df_03-23
df_03-21
df_11-21
df_01-24
df_01-18
df_01-19
df_11-20
df_03-20


  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_parentheses_to_negative)
  df_new = df_new.applymap(convert_paren

In [139]:
big_df['kWh'] = big_df['kWh'].str.replace(r'\s', '', regex=True)   # Remove all spaces
big_df['kWh'] = big_df['kWh'].str.replace(r'^\d+\.?\d*%', '', regex=True)  # Remove percentage and any preceding characters

big_df['Power Suppliers'] = big_df['Power Suppliers'].str.replace(r'\s*\d+\.?\d*%\s*$', '', regex=True)

big_df = big_df[~big_df['Power Suppliers'].str.contains("Subtotal", na=False, case=False)]

big_df.head(20)

Unnamed: 0,Power Suppliers,kWh,Average Generation Cost,%,Date
0,Quezon Power Phils Ltd. Co. (QPPL),73940954,17.6352,2.5%,Apr-24
1,First Gas Power Corporation (FGPC) – Santa Rita,499668979,6.7554,17.1%,Apr-24
2,FGP Corp. (FGP) – San Lorenzo,274353355,6.5367,9.4%,Apr-24
4,First NatGas Power Corp. (FNPC) - San Gabriel,-,-,0.0%,Apr-24
5,San Buenaventura Power Ltd. Co. (SBPL),286480000,6.9822,9.8%,Apr-24
6,Solar Philippines Tarlac Corp. (SPTC),11478701,3.3122,0.4%,Apr-24
7,AC Energy (baseload),139200000,3.9240,4.8%,Apr-24
8,AC Energy (midmerit),45936000,4.4442,1.6%,Apr-24
9,Sual Power Inc (SPI) (midmerit),106720000,5.1630,3.7%,Apr-24
10,Energy Development Corporation (EDC) (midmerit),28385000,5.5294,1.0%,Apr-24


### Creating Supplier Dataframe

In [140]:
unique_suppliers = big_df['Power Suppliers'].unique()

# Create a mapping of power suppliers to unique IDs
supplier_id_map = {supplier: id+1 for id, supplier in enumerate(unique_suppliers)}

# Create a new DataFrame from the mapping
supplier_df = pd.DataFrame(list(supplier_id_map.items()), columns=['Power Suppliers', 'Power Supplier ID'])

supplier_df.head()

Unnamed: 0,Power Suppliers,Power Supplier ID
0,Quezon Power Phils Ltd. Co. (QPPL),1
1,First Gas Power Corporation (FGPC) – Santa Rita,2
2,FGP Corp. (FGP) – San Lorenzo,3
3,First NatGas Power Corp. (FNPC) - San Gabriel,4
4,San Buenaventura Power Ltd. Co. (SBPL),5


In [142]:
unique_suppliers

array(['Quezon Power Phils Ltd. Co. (QPPL)',
       'First Gas Power Corporation (FGPC) – Santa Rita',
       'FGP Corp. (FGP) – San Lorenzo',
       'First NatGas Power Corp. (FNPC) - San Gabriel',
       'San Buenaventura Power Ltd. Co. (SBPL)',
       'Solar Philippines Tarlac Corp. (SPTC)', 'AC Energy (baseload)',
       'AC Energy (midmerit)', 'Sual Power Inc (SPI) (midmerit)',
       'Energy Development Corporation (EDC) (midmerit)',
       'Powersource First Bulacan Solar Inc. (PFBS)',
       'Solar Philippines Tanauan Corp. (SPTanC)',
       'South Premiere Power Corp. (SPPC) EPSA-1',
       'Therma Luzon Inc (TLI) EPSA',
       'South Premiere Power Corp. (SPPC) EPSA-2',
       'Wholesale Electricity Spot Market (WESM)',
       'Export Energy from Net Metering Customers', 'Others*', 'TOTAL',
       'Other Generation Adjustments (OGA)', 'Pilferage Recovery',
       'ILP Recovery', 'High Load Factor Rider', 'TOU Differential',
       'APRIL 2024 GENERATION CHARGE', 'SEM-Calaca P

In [12]:
# Create a mapping from Power Suppliers to Supplier IDs
supplier_mapping = dict(zip(supplier_df['Power Suppliers'], supplier_df['Power Supplier ID']))

# Replace names with IDs in big_df
big_df['Power Supplier ID'] = big_df['Power Suppliers'].map(supplier_mapping)

# Ensure IDs are not converted to floats
#big_df['Power Supplier ID'] = big_df['Power Supplier ID'].astype(int)

# Drop the old Power Suppliers column
big_df = big_df.drop(columns=['Power Suppliers'])

big_df.head()

Unnamed: 0,kWh,Average Generation Cost,Generation Charge,Date,Power Supplier ID
0,73940954,17.6352,6.3889,Apr-24,1
1,499668979,6.7554,6.3889,Apr-24,2
2,274353355,6.5367,6.3889,Apr-24,3
3,-,-,6.3889,Apr-24,4
4,286480000,6.9822,6.3889,Apr-24,5


In [13]:
%pip install openpyxl

with pd.ExcelWriter("Historical_MERALCO_GC_Breakdown.xlsx", engine='openpyxl') as writer:
    big_df.to_excel(writer, sheet_name='Historical GC', index=False)
    supplier_df.to_excel(writer, sheet_name='Supplier IDs', index=False)

Note: you may need to restart the kernel to use updated packages.


### **Troubleshooting**

In [121]:
df = tabula.read_pdf("pdf downloads/gc_table_04-24.pdf", stream=True, pages=1)[0]

In [122]:
df

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,(C),Unnamed: 4,Unnamed: 5
0,,,,,(D = B+ C),,[D/A]
1,,,,(B),Other Cost,,
2,,% of Total,(A),,Total,,Average
3,,,,Basic,Adjustments (NSS,,
4,Source,kWh,kWh,,Generation Cost for,,Generation
5,,,,Generation Cost,and Other Billing,,
6,,Purchased,Purchased,,the Month,,Cost
7,,,,(PhP),Adjustments),,
8,,,,,(PhP),,(PhP/kWh)
9,,,,,(PhP),,


In [123]:
# Assuming df is already defined and contains the relevant data

# Step 1: Filter the DataFrame
supplier_index = df[df.iloc[:, 0].str.contains("SUPPLIERS", na=False)].index[0]
df_new = df.iloc[supplier_index + 1:]

df_new = df_new.reset_index(drop=True)

charge_index = df_new[df_new.iloc[:, 0].str.contains("CHARGE", na=False, case=False, regex=False)].index
if not charge_index.empty:
    charge_index = charge_index[0]
    df_new = df_new.iloc[:charge_index + 1]

# Reset index for the cleaned DataFrame
df_new = df_new.reset_index(drop=True)

# Identify columns based on their content
col_with_comma = None
col_with_dot = None
col_with_percent = None

# Find the first column with a comma
for i in range(1, len(df_new.columns)):
    if df_new.iloc[:, i].dtype == object:
        if col_with_comma is None and df_new.iloc[:, i].str.contains(',', na=False).any():
            col_with_comma = i

# Find the last column with a dot
for i in range(len(df_new.columns) - 1, -1, -1):
    if df_new.iloc[:, i].dtype == object:
        if df_new.iloc[:, i].str.contains('\.', na=False).any():
            col_with_dot = i
            break

# Find the last column with a percent sign
for i in range(len(df_new.columns) - 1, -1, -1):
    if df_new.iloc[:, i].dtype == object:
        if df_new.iloc[:, i].str.contains('%', na=False).any():
            col_with_percent = i
            break

columns_to_include = [0]  # Always include the first column

if col_with_comma is not None:
    df_new['kWh'] = df_new.iloc[:, col_with_comma]
    columns_to_include.append(col_with_comma)

if col_with_dot is not None and col_with_dot not in columns_to_include:
    columns_to_include.append(col_with_dot)

if col_with_percent is not None:
    df_new['%'] = df_new.iloc[:, col_with_percent]  
    columns_to_include.append(col_with_percent)

if len(columns_to_include) == 1:
    # If only one column to include, default to the first two columns
    columns_to_include = [0, 1]

if col_with_comma == col_with_percent:
    columns_to_include = [
        df_new.columns[0],  # First column
        'kWh',  # kWh column
        df_new.columns[col_with_dot],  # Column with dot
        df_new.columns[col_with_percent]  # Column with percent
    ]

    # Adjust the regular expression to handle spaces within the kWh value
    df_new['%'] = df_new.iloc[:, col_with_percent].str.extract(r'(\d+\.\d+%)')

    # Remove spaces within the kWh values
    df_new['kWh'] = df_new.iloc[:, col_with_percent].str.extract(r'\d+\.\d+%\s*(\d\s*[\d,]+)')[0].str.replace(' ', '')

# Convert values in parentheses to negative numbers
def convert_parentheses_to_negative(value):
    if isinstance(value, str) and value.startswith('(') and value.endswith(')'):
        return '-' + value[1:-1]
    return value

df_new = df_new.applymap(convert_parentheses_to_negative)

df_new = pd.DataFrame({
    'Power Suppliers': df_new.iloc[:, 0],
    'kWh': df_new['kWh'],
    'Average Generation Cost': df_new.iloc[:, col_with_dot],
    '%': df_new['%']
})
df_new.columns = ['Power Suppliers', 'kWh', 'Average Generation Cost', '%']

# Remove numbers in Power Suppliers
df_new['Power Suppliers'] = df_new['Power Suppliers'].str.replace(r'^\d+\.\s*', '', regex=True)

# Get the 'Generation Charge' value
charge_row = df[df.iloc[:, 0].str.contains('CHARGE', case=False, na=False)]

if not charge_row.empty:
    # Get the value from the last column
    rate_value = charge_row.iloc[:, -1].values[0]

    # Check if the value is NaN
    if pd.isna(rate_value):
        # If NaN, get the value from the second last column
        rate_value = charge_row.iloc[:, -2].values[0]
else:
    rate_value = None

df_new['Generation Charge'] = rate_value

end_part = key.split("_")[1]

month_map = {
    '01': 'Jan', '02': 'Feb', '03': 'Mar', '04': 'Apr',
    '05': 'May', '06': 'Jun', '07': 'Jul', '08': 'Aug',
    '09': 'Sep', '10': 'Oct', '11': 'Nov', '12': 'Dec'
}
month, year = end_part.split('-')
formatted_date = f"{month_map[month]}-{year}"

# Add the new column to the dataframe
df_new['Date'] = formatted_date

df_new

  df_new = df_new.applymap(convert_parentheses_to_negative)


Unnamed: 0,Power Suppliers,kWh,Average Generation Cost,%,Generation Charge,Date
0,Quezon Power Phils Ltd. Co. (QPPL),"7 3,940,954",17.6352,2.5%,6.3889,Apr-24
1,First Gas Power Corporation (FGPC) – Santa Rita,499668979,6.7554,17.1%,6.3889,Apr-24
2,FGP Corp. (FGP) – San Lorenzo,274353355,6.5367,9.4%,6.3889,Apr-24
3,Subtotal - Independent Power Producers (IPPs),847963288,7.6334,29.0%,6.3889,Apr-24
4,First NatGas Power Corp. (FNPC) - San Gabriel,-,-,0.0%,6.3889,Apr-24
5,San Buenaventura Power Ltd. Co. (SBPL),286480000,6.9822,9.8%,6.3889,Apr-24
6,Solar Philippines Tarlac Corp. (SPTC),"1 1,478,701",3.3122,0.4%,6.3889,Apr-24
7,AC Energy (baseload),139200000,3.9240,4.8%,6.3889,Apr-24
8,AC Energy (midmerit),"4 5,936,000",4.4442,1.6%,6.3889,Apr-24
9,Sual Power Inc (SPI) (midmerit),106720000,5.1630,3.7%,6.3889,Apr-24


None


  df_new = df_new.applymap(convert_parentheses_to_negative)
