In [122]:
import os
import re
import tabula
import pandas as pd
import numpy as np

In [123]:
# Set the JAVA_HOME environment variable to the Java installation directory
os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk/libexec/openjdk.jdk"

In [124]:
# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [125]:
%pip install -q tabula-py
%pip install requests beautifulsoup4
%pip install openpyxl
%pip install selenium
%pip install webdriver-manager

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### **Importing**

In [126]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = "https://morepower.com.ph/monthly-rates/"
pdf_folder = "pdf downloads"

# Create the folder if it doesn't exist
os.makedirs(pdf_folder, exist_ok=True)

# Define headers to include in the request
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Send a GET request to the webpage with headers
response = requests.get(url, headers=headers)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all links that end with .pdf
    pdf_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.pdf')]
    
    # Download each PDF
    for link in pdf_links:
        pdf_url = urljoin(url, link)  # Create the full URL if it's relative
        pdf_filename = os.path.join(pdf_folder, os.path.basename(pdf_url))
        
        # Send a GET request to the PDF URL
        pdf_response = requests.get(pdf_url, headers=headers)
        
        if pdf_response.status_code == 200:
            # Save the PDF content
            with open(pdf_filename, "wb") as pdf_file:
                pdf_file.write(pdf_response.content)
            print(f"Downloaded {pdf_filename}")
        else:
            print(f"Failed to download {pdf_url}. Status code: {pdf_response.status_code}")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

Downloaded pdf downloads/Generation-Charge_July2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_June-2024_Web-Upload_revised0618.pdf
Downloaded pdf downloads/Generation-Charge_May-2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Apr-2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Mar-2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Feb-2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Jan-2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Dec-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Nov-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Oct-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Sep-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_August-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_July-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_June-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge-f

### **Processing**

In [390]:
# Initialize dictionaries to store keys for DataFrames with and without "Supplier"
supplier_present_keys = {}
supplier_absent_keys = {}

# Loop through all PDF files in the folder
for filename in os.listdir(pdf_folder_path):
    try:
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder_path, filename)

            # Read all tables from the specified page(s)
            tables = tabula.read_pdf(pdf_path, lattice=True, pages='all', multiple_tables=True)

            # Ensure there is at least one table
            if not tables:
                raise ValueError(f"No tables found in the PDF: {filename}")

            # Process the first table (assuming it has headers)
            first_table = tables[0]

            # Extract column names from the first table
            first_table_columns = first_table.columns

            # Initialize a list to store processed DataFrames
            processed_tables = [first_table]

            # Process each subsequent table
            for table in tables[1:]:
                # Convert the header row of the current table to a DataFrame row
                table_header_as_row = pd.DataFrame([table.columns.tolist()], columns=table.columns)
                
                # Append the header row to the current table data
                table_with_header_as_row = pd.concat([table_header_as_row, table], ignore_index=True)

                # Check if the number of columns matches
                if len(table_with_header_as_row.columns) != len(first_table_columns):
                    # Align columns by adding missing columns with NaN values
                    missing_cols = len(first_table_columns) - len(table_with_header_as_row.columns)
                    if missing_cols > 0:
                        # Add missing columns with NaN values
                        table_with_header_as_row = pd.concat(
                            [table_with_header_as_row, pd.DataFrame(columns=[f"Column_{i+1}" for i in range(missing_cols)])], 
                            axis=1
                        )
                    elif missing_cols < 0:
                        # Truncate extra columns
                        table_with_header_as_row = table_with_header_as_row.iloc[:, :len(first_table_columns)]
                
                # Rename columns of the current table to match the first table
                table_with_header_as_row.columns = first_table_columns
                
                # Append the table to the list of processed tables
                processed_tables.append(table_with_header_as_row)

            # Combine all tables into a single DataFrame, stacking vertically
            df = pd.concat(processed_tables, ignore_index=True)

            # Check if any of the column headers contain the word "Supplier"
            if any('Supplier' in col or 'Power' in col for col in df.columns):
                # Store the key in supplier_present_keys dictionary
                supplier_present_keys[filename] = df
            else:
                # Store the key in supplier_absent_keys dictionary
                supplier_absent_keys[filename] = df

    except Exception as e:
        print(f"Error processing file {filename}: {e}")

# Output the keys for verification
print("Files with 'Supplier' in headers:", supplier_present_keys.keys())
print("Files without 'Supplier' in headers:", supplier_absent_keys.keys())

Files with 'Supplier' in headers: dict_keys(['Generation-Charge_June-2024_Web-Upload_revised0618.pdf', 'Generation-Charge_July2024_Web-Upload.pdf', 'Generation-Charge_Jan-2024_Web-Upload.pdf', 'Generation-Charge-for-June-2022.pdf', 'Generation-Charge-for-July-2022.pdf', 'Generation-Charge-for-November-2022.pdf', 'Generation-Charge_Sep-2023_Web-Upload.pdf', 'Generation-Charge_Oct-2023_Web-Upload.pdf', 'Generation-Charge-for-November-2021.pdf', 'Generation-Charge_Dec-2023_Web-Upload.pdf', 'Generation-Charge-for-March-2023.pdf', 'Generation-Charge_Nov-2023_Web-Upload.pdf', 'Generation-Charge-for-March-2022.pdf', 'Generation-Charge-for-April-2023.pdf', 'Generation-Charge-for-April-2022.pdf', 'Generation-Charge_Feb-2024_Web-Upload.pdf', 'Generation-Charge_June-2023_Web-Upload.pdf', 'Generation-Charge_Mar-2024_Web-Upload.pdf', 'Generation-Charge-for-September-2021.pdf', 'Generation-Charge-for-February-2023.pdf', 'Generation-Charge-for-February-2022.pdf', 'Generation-Charge-for-May-2022.pdf',

#### for dataframes w/ column headers

In [408]:
supplier_present_keys.keys()

dict_keys(['Generation-Charge_June-2024_Web-Upload_revised0618.pdf', 'Generation-Charge_July2024_Web-Upload.pdf', 'Generation-Charge_Jan-2024_Web-Upload.pdf', 'Generation-Charge-for-June-2022.pdf', 'Generation-Charge-for-July-2022.pdf', 'Generation-Charge-for-November-2022.pdf', 'Generation-Charge_Sep-2023_Web-Upload.pdf', 'Generation-Charge_Oct-2023_Web-Upload.pdf', 'Generation-Charge-for-November-2021.pdf', 'Generation-Charge_Dec-2023_Web-Upload.pdf', 'Generation-Charge-for-March-2023.pdf', 'Generation-Charge_Nov-2023_Web-Upload.pdf', 'Generation-Charge-for-March-2022.pdf', 'Generation-Charge-for-April-2023.pdf', 'Generation-Charge-for-April-2022.pdf', 'Generation-Charge_Feb-2024_Web-Upload.pdf', 'Generation-Charge_June-2023_Web-Upload.pdf', 'Generation-Charge_Mar-2024_Web-Upload.pdf', 'Generation-Charge-for-September-2021.pdf', 'Generation-Charge-for-February-2023.pdf', 'Generation-Charge-for-February-2022.pdf', 'Generation-Charge-for-May-2022.pdf', 'Generation-Charge-for-August-202

In [412]:
dataframes = []

def extract_date_key(filename):
    # Define patterns for different filename formats
    patterns = [
        r'(\w+)-(\d{4})',        # Matches "Month-YYYY" (e.g., "June-2024")
        r'(\d{4})-(\d{2})',      # Matches "YYYY-MM" (e.g., "2024-06")
        r'(\d{2})-(\d{2})',      # Matches "MM-YY" (e.g., "06-24")
        r'(\d{4})(\d{2})',       # Matches "YYYYMM" (e.g., "202406")
        r'(\d{2})-(\d{4})',      # Matches "MM-YYYY" (e.g., "06-2024")
        r'(\d{4})-(\w+)',        # Matches "YYYY-Month" (e.g., "2024-July")
        r'(\w+)(\d{4})'          # Matches "MonthYYYY" (e.g., "July2024")
    ]
    
    # Define month abbreviation to number mapping
    month_abbr_to_num = {
        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06',
        'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
        'January': '01', 'February': '02', 'March': '03', 'April': '04', 'May': '05',
        'June': '06', 'July': '07', 'August': '08', 'September': '09', 'October': '10',
        'November': '11', 'December': '12'
    }
    
    # Define month number to abbreviation mapping
    num_to_month_abbr = {v: k for k, v in month_abbr_to_num.items()}
    
    # Extract text after the first underscore and remove file extension
    parts = filename.split('_', 1)
    filename = parts[1] if len(parts) > 1 else filename
    filename = re.sub(r'\.pdf$', '', filename)
    
    # Try to find a match with each pattern
    for pattern in patterns:
        match = re.search(pattern, filename)
        if match:
            groups = match.groups()
            if len(groups) == 2:
                part1, part2 = groups
                if part1.isdigit() and len(part1) == 4:
                    # Handle YYYY-MM or YYYYMM formats
                    if len(part2) == 2 and part2.isdigit():
                        # Format: YYYY-MM or YYYYMM
                        month_abbr = num_to_month_abbr.get(part2)
                        if month_abbr:
                            return f"{month_abbr}-{part1}"  # MMM-YYYY format
                        else:
                            raise ValueError(f"Invalid month in filename: {filename}")
                    else:
                        raise ValueError(f"Unexpected part2 format in filename: {filename}")
                elif len(part1) == 3 and part2.isdigit() and len(part2) == 4:
                    # Handle Month-YYYY format
                    month_abbr = month_abbr_to_num.get(part1)
                    if month_abbr:
                        return f"{month_abbr}-{part2}"  # MMM-YYYY format
                    else:
                        raise ValueError(f"Invalid month abbreviation in filename: {filename}")
                else:
                    raise ValueError(f"Unexpected format in filename: {filename}")
            elif len(groups) == 3:
                month_abbr, year = groups[1], groups[2]
                # Handle Month-YYMM format
                month_num = month_abbr_to_num.get(month_abbr)
                if month_num:
                    return f"{month_num}-{year[:4]}"  # MMM-YYYY format
                else:
                    raise ValueError(f"Invalid month abbreviation in filename: {filename}")
            elif len(groups) == 1 and groups[0] in month_abbr_to_num:
                # Handle MonthYYYY format
                month_abbr = month_abbr_to_num[groups[0]]
                return f"{month_abbr}-{year}"  # MMM-YYYY format

    # If no pattern matches, return an error
    raise ValueError(f"Date not found in filename: {filename}")


def process_dataframe(df, date):
    # Slicing columns
    keywords = ["Power", "%", "Purchased", "Total"]
    filtered_columns = [col for col in df.columns if any(keyword in col for keyword in keywords)]
    sliced_columns = df[filtered_columns]
    sliced_columns.columns = ["Power Supplier", "%", "kWh", "Total Generation Cost"]

    # Slicing rows
    first_non_null_idx = sliced_columns[sliced_columns.iloc[:, 1].notna()].index[0]
    wesm_row_idx = sliced_columns[sliced_columns.iloc[:, 0].str.contains("WESM", na=False, case=False)].index
    total_row_idx = sliced_columns[sliced_columns.iloc[:, 0].str.strip().eq("TOTAL")].index

    if not wesm_row_idx.empty and not total_row_idx.empty:
        wesm_row_idx = wesm_row_idx[0]
        total_row_idx = total_row_idx[0]
        sliced_rows = sliced_columns.loc[first_non_null_idx:wesm_row_idx]
        if total_row_idx > wesm_row_idx:
            total_row = sliced_columns.loc[[total_row_idx]]
            sliced_rows = pd.concat([sliced_rows, total_row], ignore_index=True)

    sliced_rows.columns = ["Power Supplier", "%", "kWh", "Total Generation Cost"]

    # Cleaning "Power Supplier" column
    def clean_power_supplier(value):
        if pd.isnull(value):
            return value
        value = re.sub(r'^\d+\.\s*', '', value)
        return value

    sliced_rows.iloc[:, 0] = sliced_rows.iloc[:, 0].apply(clean_power_supplier)
    sliced_rows = sliced_rows.dropna(subset=[sliced_rows.columns[0]]).reset_index(drop=True)

    # Calculate "Average Generation Cost"
    total_row = sliced_columns[sliced_columns.iloc[:, 0].str.contains("TOTAL", na=False)]
    if not total_row.empty:
        total_generation_cost_str = total_row['Total Generation Cost'].values[0]
        kwh_purchased_str = total_row['kWh'].values[0]
        total_generation_cost = float(total_generation_cost_str.replace(',', ''))
        kwh_purchased = float(kwh_purchased_str.replace(',', ''))

        def calculate_average_generation_cost(row):
            try:
                total_cost = float(row['Total Generation Cost'].replace(',', ''))
                kwh = float(row['kWh'].replace(',', ''))
                return total_cost / kwh
            except (ValueError, TypeError):
                return np.nan

        sliced_rows['Average Generation Cost'] = sliced_rows.apply(calculate_average_generation_cost, axis=1)
        sliced_rows = sliced_rows.drop(columns=['Total Generation Cost'])

    # Add "Generation Charge" and remove "TOTAL" row
    total_row_idx = sliced_rows[sliced_rows.iloc[:, 0] == "TOTAL"].index[0]
    generation_charge_value = sliced_rows.at[total_row_idx, "Average Generation Cost"]
    sliced_rows["Generation Charge"] = generation_charge_value
    sliced_rows = sliced_rows.drop(total_row_idx)

    # Add the "Date" column
    sliced_rows["Date"] = date

    return sliced_rows

# Initialize an empty DataFrame to store all results
big_df = pd.DataFrame()

for filename, df in supplier_present_keys.items():
    try:
        # Extract the date from the filename
        date = extract_date_key(filename)
        
        # Process the DataFrame
        processed_df = process_dataframe(df, date)
        
        # Append the processed DataFrame to big_df
        big_df = pd.concat([big_df, processed_df], ignore_index=True)
    
    except Exception as e:
        print(f"Error processing {filename}: {e}")

big_df

Error processing Generation-Charge_June-2024_Web-Upload_revised0618.pdf: Unexpected format in filename: June-2024_Web-Upload_revised0618
Error processing Generation-Charge_July2024_Web-Upload.pdf: Unexpected format in filename: July2024_Web-Upload
Error processing Generation-Charge-for-June-2022.pdf: Unexpected format in filename: Generation-Charge-for-June-2022
Error processing Generation-Charge-for-July-2022.pdf: Unexpected format in filename: Generation-Charge-for-July-2022
Error processing Generation-Charge-for-November-2022.pdf: Unexpected format in filename: Generation-Charge-for-November-2022
Error processing Generation-Charge-for-November-2021.pdf: Unexpected format in filename: Generation-Charge-for-November-2021
Error processing Generation-Charge-for-March-2023.pdf: Unexpected format in filename: Generation-Charge-for-March-2023
Error processing Generation-Charge-for-March-2022.pdf: Unexpected format in filename: Generation-Charge-for-March-2022
Error processing Generation-Ch

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost,Generation Charge,Date
0,Sem Calaca Power Corp.,30.01%,16882500.00,4.916333,5.270779,01-2024
1,KEPCO SPC Power Corp.,19.99%,11245000.00,5.998242,5.270779,01-2024
2,Energy Development Corp,16.64%,9360000.00,6.083,5.270779,01-2024
3,Spot Market (WESM),33.12%,18628682.37,4.772179,5.270779,01-2024
4,Sem Calaca Power Corp.,32.39%,18500000.00,4.81718,5.904401,09-2023
5,KEPCO SPC Power Corp.,20.06%,11456000.00,7.039274,5.904401,09-2023
6,Southwest Luzon Power Gen Co.,24.77%,14151100.00,6.331946,5.904401,09-2023
7,Energy Development Corp,16.93%,9672000.00,6.0003,5.904401,09-2023
8,Spot Market (WESM),5.61%,3201786.50,6.157472,5.904401,09-2023
9,Sem Calaca Power Corp.,32.87%,18572500.00,4.794143,5.754419,10-2023


In [411]:
import re

def extract_date_key(filename):
    # Define patterns for different filename formats
    patterns = [
        r'(\w+)-(\d{4})',        # Matches "Month-YYYY" (e.g., "June-2024")
        r'(\d{4})-(\d{2})',      # Matches "YYYY-MM" (e.g., "2024-06")
        r'(\d{2})-(\d{2})',      # Matches "MM-YY" (e.g., "06-24")
        r'(\d{4})(\d{2})',       # Matches "YYYYMM" (e.g., "202406")
        r'(\d{2})-(\d{4})',      # Matches "MM-YYYY" (e.g., "06-2024")
        r'(\d{4})-(\w+)',        # Matches "YYYY-Month" (e.g., "2024-July")
        r'(\w+)(\d{4})'          # Matches "MonthYYYY" (e.g., "July2024")
    ]
    
    # Define month abbreviation to number mapping
    month_abbr_to_num = {
        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06',
        'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
        'January': '01', 'February': '02', 'March': '03', 'April': '04', 'May': '05',
        'June': '06', 'July': '07', 'August': '08', 'September': '09', 'October': '10',
        'November': '11', 'December': '12'
    }
    
    # Define month number to abbreviation mapping
    num_to_month_abbr = {v: k for k, v in month_abbr_to_num.items()}
    
    # Extract text after the first underscore and remove file extension
    parts = filename.split('_', 1)
    filename = parts[1] if len(parts) > 1 else filename
    filename = re.sub(r'\.pdf$', '', filename)
    
    # Try to find a match with each pattern
    for pattern in patterns:
        match = re.search(pattern, filename)
        if match:
            groups = match.groups()
            if len(groups) == 2:
                part1, part2 = groups
                if part1.isdigit() and len(part1) == 4:
                    # Handle YYYY-MM or YYYYMM formats
                    if len(part2) == 2 and part2.isdigit():
                        # Format: YYYY-MM or YYYYMM
                        month_abbr = num_to_month_abbr.get(part2)
                        if month_abbr:
                            return f"{month_abbr}-{part1}"  # MMM-YYYY format
                        else:
                            raise ValueError(f"Invalid month in filename: {filename}")
                    else:
                        raise ValueError(f"Unexpected part2 format in filename: {filename}")
                elif len(part1) == 3 and part2.isdigit() and len(part2) == 4:
                    # Handle Month-YYYY format
                    month_abbr = month_abbr_to_num.get(part1)
                    if month_abbr:
                        return f"{month_abbr}-{part2}"  # MMM-YYYY format
                    else:
                        raise ValueError(f"Invalid month abbreviation in filename: {filename}")
                else:
                    raise ValueError(f"Unexpected format in filename: {filename}")
            elif len(groups) == 3:
                month_abbr, year = groups[1], groups[2]
                # Handle Month-YYMM format
                month_num = month_abbr_to_num.get(month_abbr)
                if month_num:
                    return f"{month_num}-{year[:4]}"  # MMM-YYYY format
                else:
                    raise ValueError(f"Invalid month abbreviation in filename: {filename}")
            elif len(groups) == 1 and groups[0] in month_abbr_to_num:
                # Handle MonthYYYY format
                month_abbr = month_abbr_to_num[groups[0]]
                return f"{month_abbr}-{year}"  # MMM-YYYY format

    # If no pattern matches, return an error
    raise ValueError(f"Date not found in filename: {filename}")

# Example usage
filenames = [
    "GenCharge_202101.pdf",
    "GenCharge_202102.pdf",
    "GenCharge_202103.pdf",
    "GenCharge_202107.pdf",
    "GenCharge_202106.pdf",
    "GenCharge_202104.pdf",
    "GenCharge_202105.pdf"
]

for filename in filenames:
    try:
        print(f"Date for {filename}: {extract_date_key(filename)}")
    except ValueError as e:
        print(f"Error processing {filename}: {e}")


Date for GenCharge_202101.pdf: January-2021
Date for GenCharge_202102.pdf: February-2021
Date for GenCharge_202103.pdf: March-2021
Date for GenCharge_202107.pdf: July-2021
Date for GenCharge_202106.pdf: June-2021
Date for GenCharge_202104.pdf: April-2021
Date for GenCharge_202105.pdf: May-2021


#### for dataframes w/o column headers

In [393]:
supplier_absent_keys.keys()

dict_keys(['GenCharge_Mar2020.pdf', 'GenCharge_Jul2020.pdf', 'GenCharge_May2020.pdf', 'GenCharge_Aug2020.pdf', 'GenCharge_Oct2020.pdf', 'GenCharge_Sep2020.pdf', 'GenCharge_Apr2020.pdf', 'GenCharge_Dec2020.pdf', 'GenCharge_Nov2020.pdf'])

In [394]:
# Iterate over each item in the dictionary
for key, df in supplier_absent_keys.items():
    # Define the keywords to search for
    keywords = ["CONTRACTS", "%", "Purchased", "Average"]

    # Create a list to store the column indices that match the criteria
    matching_columns = []

    # Find columns where the keywords are present as row values
    for keyword in keywords:
        if keyword == "Purchased":
            # For "Purchased", we need to handle the second occurrence
            purchased_columns = df.apply(lambda col: col.str.contains(keyword, na=False)).any()
            # Get the indices of all columns with "Purchased"
            purchased_indices = purchased_columns[purchased_columns].index.tolist()
            if len(purchased_indices) > 1:
                # Select the second occurrence
                matching_columns.append(purchased_indices[1])
        else:
            # For all other keywords, find columns containing the keyword as a row value
            matching_columns += df.columns[df.apply(lambda col: col.str.contains(keyword, na=False)).any()].tolist()

    # Drop duplicates in case there are overlaps in columns
    matching_columns = pd.Index(matching_columns).unique()

    # Create the new DataFrame with the selected columns
    sliced_columns = df[matching_columns]

    # Attempt to find the index of the row containing the word "CONTRACTS" in the first column
    contract_indices = sliced_columns[sliced_columns.iloc[:, 0].str.contains('CONTRACTS', case=False, na=False)].index

    if not contract_indices.empty:
        # If "CONTRACTS" is found, use it as the start index
        start_index = contract_indices[0] + 1
    else:
        # If "CONTRACTS" is not found, find the first non-null value in the first column
        first_non_null_index = sliced_columns[sliced_columns.iloc[:, 0].notnull()].index[0]
        start_index = first_non_null_index  # Include this row as well

    # Find the index of the row containing the word "WESM" in the first column
    end_index = sliced_columns[
        sliced_columns.iloc[:, 0].str.contains(r'WESM', case=False, na=False)
    ].index[0]

    # Slice the DataFrame to include rows between start_index and end_index (inclusive)
    sliced_rows = sliced_columns.iloc[start_index:end_index + 1]

    # Exclude the row that contains the word "Metering" in the first column
    sliced_rows = sliced_rows[~sliced_rows.iloc[:, 0].str.contains('Metering', case=False, na=False)]

    # Uncomment and adjust the following line if you need to rename columns
    sliced_rows.columns = ["Power Supplier", "%", "kWh", "Average Generation Cost"]

    # Function to clean the "Power Supplier" column
    def clean_power_supplier(value):
        # Remove leading numbers and periods
        value = re.sub(r'^\d+\.\s*', '', value)
        return value

    # Apply the cleaning function to the first column of sliced_rows
    sliced_rows['Power Supplier'] = sliced_rows['Power Supplier'].apply(clean_power_supplier)

    # Find the column that contains the word "Average"
    average_column = sliced_columns.columns[sliced_columns.apply(lambda col: col.str.contains('Average', case=False, na=False)).any()][0]

    # Find rows where the first column contains the word "TOTAL"
    total_row = sliced_columns.loc[sliced_columns.iloc[:, 0].str.contains("TOTAL", case=False, na=False), average_column]
    reversed_row = total_row[::-1]

    for value in reversed_row:
        if pd.notnull(value):
            total_row_value = value
            break  # Exit loop once a non-null value is found

    # Add the "Generation Charge" to the sliced_rows DataFrame
    sliced_rows["Generation Charge"] = total_row_value

    # Add the date column
    date_str = extract_date_key(key)
    sliced_rows['Date'] = date_str

    # Append to big_df
    big_df = pd.concat([big_df, sliced_rows], ignore_index=True)

big_df

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost,Generation Charge,Date
0,Sem Calaca Power Corp.,25.84%,17825000.00,4.60928,3.983329,06-24
1,KEPCO SPC Power Corp.,20.58%,14200000.00,4.871499,3.983329,06-24
2,Energy Development Corp,14.05%,9692000.00,6.346905,3.983329,06-24
3,Spot Market (WESM),39.30%,27109551.24,2.28216,3.983329,06-24
4,Sem Calaca Power Corp.,29.98%,18125000.00,4.835937,7.557206,07-24
5,KEPCO SPC Power Corp.,23.99%,14500000.00,4.931647,7.557206,07-24
6,Energy Development Corp,16.01%,9678000.00,6.181348,7.557206,07-24
7,Spot Market (WESM),,,,7.557206,07-24
8,Sem Calaca Power Corp.,30.01%,16882500.00,4.916333,5.270779,01-24
9,KEPCO SPC Power Corp.,19.99%,11245000.00,5.998242,5.270779,01-24


#### Creating Supplier Dataframe

In [395]:
unique_suppliers = big_df['Power Supplier'].unique()
unique_suppliers

array(['Sem Calaca Power Corp.', 'KEPCO SPC Power Corp.',
       'Energy Development Corp', 'Spot Market (WESM)',
       'PSALM Corporation', 'Sem Calaca Power Corp',
       'KEPCO SPC Power Corp', 'Southwest Luzon Power Gen Corp',
       'Southwest Luzon Power Gen Co.', 'Southwest Luzon Power Gen Corp.',
       'Sem Calaca Power', 'KEPCO SPC Power', 'Southwest Luzon Power Gen',
       'Sem-Calaca Power Corp',
       'Panay Energy Development Corporation U1/U2',
       'Panay Energy Development Corporation U3',
       'Panay Power Corporation', 'KEPCO SPC Power Corporation',
       'Aboitiz Power Renewables, Incorporated',
       'Subtotal - Interim Power Supply Contracts',
       'Wholesale Electricity Spot Market (WESM)',
       'Panay Energy Development Corporation',
       'Subtotal - Emergency Power Supply Contracts'], dtype=object)

In [396]:
# Define the mapping dictionary
mapping_dict = {
    "Sem Calaca Power Corp": "Sem Calaca Power Corp.",
    "KEPCO SPC Power": "KEPCO SPC Power Corp.",
    "KEPCO SPC Power Corp": "KEPCO SPC Power Corp.",
    "Southwest Luzon Power Gen Corp": "Southwest Luzon Power Gen Corp.",
    "Southwest Luzon Power Gen Co.": "Southwest Luzon Power Gen Corp.",
    "Sem Calaca Power": "Sem Calaca Power Corp.",
    "Southwest Luzon Power Gen": "Southwest Luzon Power Gen Corp.",
    "Sem-Calaca Power Corp": "Sem Calaca Power Corp.",
    "KEPCO SPC Power Corporation": "KEPCO SPC Power Corp.",
    "Spot Market (WESM)": "Wholesale Electricity Spot Market (WESM)"
}

# Apply the replacements in the 'Power Supplier' column
big_df['Power Supplier'] = big_df['Power Supplier'].replace(mapping_dict)

In [397]:
unique_suppliers = big_df['Power Supplier'].unique()
unique_suppliers

array(['Sem Calaca Power Corp.', 'KEPCO SPC Power Corp.',
       'Energy Development Corp',
       'Wholesale Electricity Spot Market (WESM)', 'PSALM Corporation',
       'Southwest Luzon Power Gen Corp.',
       'Panay Energy Development Corporation U1/U2',
       'Panay Energy Development Corporation U3',
       'Panay Power Corporation',
       'Aboitiz Power Renewables, Incorporated',
       'Subtotal - Interim Power Supply Contracts',
       'Panay Energy Development Corporation',
       'Subtotal - Emergency Power Supply Contracts'], dtype=object)

In [398]:
# Filter out any suppliers that contain the string "total" (case-insensitive)
filtered_suppliers = [supplier for supplier in unique_suppliers if isinstance(supplier, str) and "total" not in supplier.lower()]

# Create a mapping of the filtered power suppliers to unique IDs
supplier_id_map = {supplier: id+1 for id, supplier in enumerate(filtered_suppliers)}

# Create a new DataFrame from the mapping
supplier_df = pd.DataFrame(list(supplier_id_map.items()), columns=['Power Suppliers', 'Power Supplier ID'])

supplier_df

Unnamed: 0,Power Suppliers,Power Supplier ID
0,Sem Calaca Power Corp.,1
1,KEPCO SPC Power Corp.,2
2,Energy Development Corp,3
3,Wholesale Electricity Spot Market (WESM),4
4,PSALM Corporation,5
5,Southwest Luzon Power Gen Corp.,6
6,Panay Energy Development Corporation U1/U2,7
7,Panay Energy Development Corporation U3,8
8,Panay Power Corporation,9
9,"Aboitiz Power Renewables, Incorporated",10


In [399]:
# Create a mapping from Power Suppliers to Supplier IDs
supplier_mapping = dict(zip(supplier_df['Power Suppliers'], supplier_df['Power Supplier ID']))

# Replace names with IDs in big_df
big_df['Power Supplier ID'] = big_df['Power Supplier'].map(supplier_mapping)

# Convert Power Supplier ID to integer, handling NaN values
big_df['Power Supplier ID'] = big_df['Power Supplier ID'].fillna(0).astype(int)

# Drop the old Power Supplier column
big_df = big_df.drop(columns=['Power Supplier'])

big_df

Unnamed: 0,%,kWh,Average Generation Cost,Generation Charge,Date,Power Supplier ID
0,25.84%,17825000.00,4.60928,3.983329,06-24,1
1,20.58%,14200000.00,4.871499,3.983329,06-24,2
2,14.05%,9692000.00,6.346905,3.983329,06-24,3
3,39.30%,27109551.24,2.28216,3.983329,06-24,4
4,29.98%,18125000.00,4.835937,7.557206,07-24,1
5,23.99%,14500000.00,4.931647,7.557206,07-24,2
6,16.01%,9678000.00,6.181348,7.557206,07-24,3
7,,,,7.557206,07-24,4
8,30.01%,16882500.00,4.916333,5.270779,01-24,1
9,19.99%,11245000.00,5.998242,5.270779,01-24,2


In [400]:
%pip install openpyxl

with pd.ExcelWriter("Historical_MEPC_GC_Breakdown.xlsx", engine='openpyxl') as writer:
    big_df.to_excel(writer, sheet_name='Historical GC', index=False)
    supplier_df.to_excel(writer, sheet_name='Supplier IDs', index=False)

Note: you may need to restart the kernel to use updated packages.


#### for troubleshooting loop

##### for dataframes w/ column headers

In [326]:
supplier_present_keys.keys()

dict_keys(['Generation-Charge_June-2024_Web-Upload_revised0618.pdf', 'Generation-Charge_July2024_Web-Upload.pdf', 'Generation-Charge_Jan-2024_Web-Upload.pdf', 'Generation-Charge-for-June-2022.pdf', 'Generation-Charge-for-July-2022.pdf', 'Generation-Charge-for-November-2022.pdf', 'Generation-Charge_Sep-2023_Web-Upload.pdf', 'Generation-Charge_Oct-2023_Web-Upload.pdf', 'Generation-Charge-for-November-2021.pdf', 'Generation-Charge_Dec-2023_Web-Upload.pdf', 'Generation-Charge-for-March-2023.pdf', 'Generation-Charge_Nov-2023_Web-Upload.pdf', 'Generation-Charge-for-March-2022.pdf', 'Generation-Charge-for-April-2023.pdf', 'Generation-Charge-for-April-2022.pdf', 'Generation-Charge_Feb-2024_Web-Upload.pdf', 'Generation-Charge_June-2023_Web-Upload.pdf', 'Generation-Charge_Mar-2024_Web-Upload.pdf', 'Generation-Charge-for-September-2021.pdf', 'Generation-Charge-for-February-2023.pdf', 'Generation-Charge-for-February-2022.pdf', 'Generation-Charge-for-May-2022.pdf', 'Generation-Charge-for-August-202

In [327]:
df = supplier_present_keys["Generation-Charge_June-2024_Web-Upload_revised0618.pdf"]
df

Unnamed: 0,Power\rSupplier,% Share on Energy\rPurchased,Energy\rPurchased\rkWh,Basic\rGeneration Cost\rPhP,Other Cost\rAdjustment\rPhP,Total\rGeneration Cost\rPhP
0,,(A),(B),(C),(D = B+C),
1,Sem Calaca Power Corp.,25.84%,17825000.00,82160418.62,-,82160418.62
2,KEPCO SPC Power Corp.,20.58%,14200000.00,69085633.34,89657.14,69175290.48
3,Energy Development Corp,14.05%,9692000.00,59882207.60,1631995.84,61514203.44
4,Spot Market (WESM),39.30%,27109551.24,61437954.98,430378.92,61868333.90
5,Net Metering Export Energy,0.23%,161508.00,1019145.96,,1019145.96
6,SUBTOTAL for Power Suppliers,100.00%,68988059.24,273585360.50,2152031.90,275737392.40
7,Others:,,,,,
8,Pilferage Cost Recoveries,,,,,"(935,260.84)"
9,Prompt Payment Discounts,,,,,


In [328]:
#Slicing columns

# Define the keywords to look for in the column names
keywords = ["Power", "%", "Purchased", "Total"]

# Filter the columns based on the presence of any of the keywords
filtered_columns = [col for col in df.columns if any(keyword in col for keyword in keywords)]

# Create the new DataFrame with the filtered columns
sliced_columns = df[filtered_columns]

sliced_columns.columns = ["Power Supplier", "%", "kWh", "Total Generation Cost"]
sliced_columns

Unnamed: 0,Power Supplier,%,kWh,Total Generation Cost
0,,(A),(B),
1,Sem Calaca Power Corp.,25.84%,17825000.00,82160418.62
2,KEPCO SPC Power Corp.,20.58%,14200000.00,69175290.48
3,Energy Development Corp,14.05%,9692000.00,61514203.44
4,Spot Market (WESM),39.30%,27109551.24,61868333.90
5,Net Metering Export Energy,0.23%,161508.00,1019145.96
6,SUBTOTAL for Power Suppliers,100.00%,68988059.24,275737392.40
7,Others:,,,
8,Pilferage Cost Recoveries,,,"(935,260.84)"
9,Prompt Payment Discounts,,,


In [333]:
#Slicing rows

# Step 1: Find the first non-null row index in the first column
first_non_null_idx = sliced_columns[sliced_columns.iloc[:, 1].notna()].index[0]

# Step 2: Find the row index where the value in the first column is "WESM"
wesm_row_idx = sliced_columns[sliced_columns.iloc[:, 0].str.contains("WESM", na=False, case=False)].index

# Step 3: Find the row index where the value in the first column is "TOTAL"
total_row_idx = sliced_columns[sliced_columns.iloc[:, 0].str.strip().eq("TOTAL")].index

# Step 4: Slice the DataFrame to include the first non-null row, up to "WESM", and include "TOTAL"
if not wesm_row_idx.empty and not total_row_idx.empty:
    wesm_row_idx = wesm_row_idx[0]  # Get the index of the "WESM" row
    total_row_idx = total_row_idx[0]  # Get the index of the "TOTAL" row
    
    # Slice rows from the first non-null row to the "WESM" row (inclusive)
    sliced_rows = sliced_columns.loc[first_non_null_idx:wesm_row_idx]
    
    # Append the "TOTAL" row if it's not already in the slice
    if total_row_idx > wesm_row_idx:
        total_row = sliced_columns.loc[[total_row_idx]]
        sliced_rows = pd.concat([sliced_rows, total_row], ignore_index=True)

sliced_rows.columns = ["Power Supplier", "%", "kWh", "Total Generation Cost"]
sliced_rows

Unnamed: 0,Power Supplier,%,kWh,Total Generation Cost
0,,(A),(B),
1,Sem Calaca Power Corp.,25.84%,17825000.00,82160418.62
2,KEPCO SPC Power Corp.,20.58%,14200000.00,69175290.48
3,Energy Development Corp,14.05%,9692000.00,61514203.44
4,Spot Market (WESM),39.30%,27109551.24,61868333.9
5,TOTAL,,68988059.24,274802131.56


In [334]:
import re

# Function to clean the "Power Supplier" column
def clean_power_supplier(value):
    if pd.isnull(value):
        return value  # Return null if the value is null
    # Remove leading numbers and periods
    value = re.sub(r'^\d+\.\s*', '', value)
    return value

# Apply the cleaning function to the first column of sliced_rows
sliced_rows.iloc[:, 0] = sliced_rows.iloc[:, 0].apply(clean_power_supplier)

# Drop rows where the first column value is null
sliced_rows = sliced_rows.dropna(subset=[sliced_rows.columns[0]])

# Reset the index after dropping rows
sliced_rows = sliced_rows.reset_index(drop=True)

sliced_rows

Unnamed: 0,Power Supplier,%,kWh,Total Generation Cost
0,Sem Calaca Power Corp.,25.84%,17825000.0,82160418.62
1,KEPCO SPC Power Corp.,20.58%,14200000.0,69175290.48
2,Energy Development Corp,14.05%,9692000.0,61514203.44
3,Spot Market (WESM),39.30%,27109551.24,61868333.9
4,TOTAL,,68988059.24,274802131.56


In [335]:
# Find the row where the value in the first column is "TOTAL"
total_row = sliced_columns[sliced_columns.iloc[:, 0].str.contains("TOTAL", na=False)]

# Extract the values for "Total Generation Cost" and "kWh"
if not total_row.empty:
    total_generation_cost_str = total_row['Total Generation Cost'].values[0]
    kwh_purchased_str = total_row['kWh'].values[0]

    # Convert the string values to float by removing commas
    total_generation_cost = float(total_generation_cost_str.replace(',', ''))
    kwh_purchased = float(kwh_purchased_str.replace(',', ''))

    # Calculate the Average Generation Cost for each row
    def calculate_average_generation_cost(row):
        try:
            total_cost = float(row['Total Generation Cost'].replace(',', ''))
            kwh = float(row['kWh'].replace(',', ''))
            return total_cost / kwh
        except (ValueError, TypeError):
            return np.nan

    sliced_rows['Average Generation Cost'] = sliced_rows.apply(calculate_average_generation_cost, axis=1)

    # Drop the "Total Generation Cost" column
    sliced_rows = sliced_rows.drop(columns=['Total Generation Cost'])

sliced_rows

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost
0,Sem Calaca Power Corp.,25.84%,17825000.0,4.60928
1,KEPCO SPC Power Corp.,20.58%,14200000.0,4.871499
2,Energy Development Corp,14.05%,9692000.0,6.346905
3,Spot Market (WESM),39.30%,27109551.24,2.28216
4,TOTAL,,68988059.24,3.983329


In [342]:
# Step 1: Find the row index where the value in the first column is exactly "TOTAL"
total_row_idx = sliced_rows[sliced_rows.iloc[:, 0] == "TOTAL"].index[0]

# Step 2: Extract the value from the "Average Generation Cost" column in the "TOTAL" row
generation_charge_value = sliced_rows.at[total_row_idx, "Average Generation Cost"]

# Step 3: Add a new column "Generation Charge" with this value repeated for all rows
sliced_rows["Generation Charge"] = generation_charge_value

# Step 4: Remove the "TOTAL" row
sliced_rows = sliced_rows.drop(total_row_idx)

sliced_rows

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost,Generation Charge
0,Sem Calaca Power Corp.,25.84%,17825000.0,4.60928,3.983329
1,KEPCO SPC Power Corp.,20.58%,14200000.0,4.871499,3.983329
2,Energy Development Corp,14.05%,9692000.0,6.346905,3.983329
3,Spot Market (WESM),39.30%,27109551.24,2.28216,3.983329


##### for dataframes w/o column headers

In [351]:
supplier_absent_keys.keys()

dict_keys(['GenCharge_Mar2020.pdf', 'GenCharge_Jul2020.pdf', 'GenCharge_May2020.pdf', 'GenCharge_Aug2020.pdf', 'GenCharge_Oct2020.pdf', 'GenCharge_Sep2020.pdf', 'GenCharge_Apr2020.pdf', 'GenCharge_Dec2020.pdf', 'GenCharge_Nov2020.pdf'])

In [369]:
df = supplier_absent_keys["GenCharge_Nov2020.pdf"]
df

Unnamed: 0.1,GENERATION CHARGE for NOVEMBER 2020,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,Power Supplier,% Share on\rEnergy\rPurchased,Energy\rPurchased\rkWh,Basic Generation\rCost\rPhP,Other Cost\rAdjustment\rPhP,Total Generation\rCost for the Month\rPhP,Average\rGeneration Rate\rPhP/kWh
1,,(A),(B),(C),(D = B+C),D/A,
2,BILATERAL CONTRACTS,,,,,,
3,1. Panay Energy Development Corporation U1/U2,41.32%,19722394.09,133216421.83,-,133216421.83,6.7546
4,2. Panay Energy Development Corporation U3,29.84%,14244166.58,85810805.18,-,85810805.18,6.0243
5,3. Panay Power Corporation,0.23%,111676.64,25455030.55,-,25455030.55,227.9351
6,4. KEPCO SPC Power Corporation,7.54%,3600000.00,16825668.60,-,16825668.60,4.6738
7,"5. Aboitiz Power Renewables, Incorporated",6.61%,3155000.00,21246597.67,-,21246597.67,6.7343
8,Subtotal - Emergency Power Supply Contracts,85.55%,40833237.31,282554523.83,-,282554523.83,6.9197
9,Wholesale Electricity Spot Market (WESM),14.40%,6875306.91,14985806.32,-,"14,985,806.32.1",2.1797


In [370]:
# Define the keywords to search for
keywords = ["CONTRACTS", "%", "Purchased", "Average"]

# Create a list to store the column indices that match the criteria
matching_columns = []

# Find columns where the keywords are present as row values
for keyword in keywords:
    if keyword == "Purchased":
        # For "Purchased", we need to handle the second occurrence
        purchased_columns = df.apply(lambda col: col.str.contains(keyword, na=False)).any()
        # Get the indices of all columns with "Purchased"
        purchased_indices = purchased_columns[purchased_columns].index.tolist()
        if len(purchased_indices) > 1:
            # Select the second occurrence
            matching_columns.append(purchased_indices[1])
    else:
        # For all other keywords, find columns containing the keyword as a row value
        matching_columns += df.columns[df.apply(lambda col: col.str.contains(keyword, na=False)).any()].tolist()

# Drop duplicates in case there are overlaps in columns
matching_columns = pd.Index(matching_columns).unique()

# Create the new DataFrame with the selected columns
sliced_columns = df[matching_columns]

sliced_columns

Unnamed: 0.1,GENERATION CHARGE for NOVEMBER 2020,Unnamed: 0,Unnamed: 1,Unnamed: 5
0,Power Supplier,% Share on\rEnergy\rPurchased,Energy\rPurchased\rkWh,Average\rGeneration Rate\rPhP/kWh
1,,(A),(B),
2,BILATERAL CONTRACTS,,,
3,1. Panay Energy Development Corporation U1/U2,41.32%,19722394.09,6.7546
4,2. Panay Energy Development Corporation U3,29.84%,14244166.58,6.0243
5,3. Panay Power Corporation,0.23%,111676.64,227.9351
6,4. KEPCO SPC Power Corporation,7.54%,3600000.00,4.6738
7,"5. Aboitiz Power Renewables, Incorporated",6.61%,3155000.00,6.7343
8,Subtotal - Emergency Power Supply Contracts,85.55%,40833237.31,6.9197
9,Wholesale Electricity Spot Market (WESM),14.40%,6875306.91,2.1797


In [371]:
# Attempt to find the index of the row containing the word "CONTRACTS" in the first column
contract_indices = sliced_columns[sliced_columns.iloc[:, 0].str.contains('CONTRACTS', case=False, na=False)].index

if not contract_indices.empty:
    # If "CONTRACTS" is found, use it as the start index
    start_index = contract_indices[0] + 1
else:
    # If "CONTRACTS" is not found, find the first non-null value in the first column
    first_non_null_index = sliced_columns[sliced_columns.iloc[:, 0].notnull()].index[0]
    start_index = first_non_null_index  # Include this row as well

# Find the index of the row containing the word "WESM" in the first column
end_index = sliced_columns[
    sliced_columns.iloc[:, 0].str.contains(r'WESM', case=False, na=False)
].index[0]

# Slice the DataFrame to include rows between start_index and end_index (inclusive)
sliced_rows = sliced_columns.iloc[start_index:end_index + 1]

# Exclude the row that contains the word "Metering" in the first column
sliced_rows = sliced_rows[~sliced_rows.iloc[:, 0].str.contains('Metering', case=False, na=False)]

# Uncomment and adjust the following line if you need to rename columns
sliced_rows.columns = ["Power Supplier", "%", "kWh", "Average Generation Cost"]

sliced_rows

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost
3,1. Panay Energy Development Corporation U1/U2,41.32%,19722394.09,6.7546
4,2. Panay Energy Development Corporation U3,29.84%,14244166.58,6.0243
5,3. Panay Power Corporation,0.23%,111676.64,227.9351
6,4. KEPCO SPC Power Corporation,7.54%,3600000.0,4.6738
7,"5. Aboitiz Power Renewables, Incorporated",6.61%,3155000.0,6.7343
8,Subtotal - Emergency Power Supply Contracts,85.55%,40833237.31,6.9197
9,Wholesale Electricity Spot Market (WESM),14.40%,6875306.91,2.1797


In [372]:
import re

# Function to clean the "Power Supplier" column
def clean_power_supplier(value):
    # Remove leading numbers and periods
    value = re.sub(r'^\d+\.\s*', '', value)
    return value

# Apply the cleaning function to the first column of sliced_df
sliced_rows.iloc[:, 0] = sliced_rows.iloc[:, 0].apply(clean_power_supplier)

sliced_rows_df = sliced_rows.reset_index(drop=True)
sliced_rows_df

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost
0,Panay Energy Development Corporation U1/U2,41.32%,19722394.09,6.7546
1,Panay Energy Development Corporation U3,29.84%,14244166.58,6.0243
2,Panay Power Corporation,0.23%,111676.64,227.9351
3,KEPCO SPC Power Corporation,7.54%,3600000.0,4.6738
4,"Aboitiz Power Renewables, Incorporated",6.61%,3155000.0,6.7343
5,Subtotal - Emergency Power Supply Contracts,85.55%,40833237.31,6.9197
6,Wholesale Electricity Spot Market (WESM),14.40%,6875306.91,2.1797


In [373]:
# Find rows where the first column contains the word "TOTAL"
total_row = sliced_columns.loc[sliced_columns.iloc[:, 0].str.contains("TOTAL", case=False, na=False), average_column]
reversed_row = total_row[::-1]

for value in reversed_row:
    if pd.notnull(value):
        total_row_value = value
        break  # Exit loop once a non-null value is found

In [375]:
sliced_rows["Generation Charge"] = total_row_value
sliced_rows

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost,Generation Charge
3,Panay Energy Development Corporation U1/U2,41.32%,19722394.09,6.7546,6.2367
4,Panay Energy Development Corporation U3,29.84%,14244166.58,6.0243,6.2367
5,Panay Power Corporation,0.23%,111676.64,227.9351,6.2367
6,KEPCO SPC Power Corporation,7.54%,3600000.0,4.6738,6.2367
7,"Aboitiz Power Renewables, Incorporated",6.61%,3155000.0,6.7343,6.2367
8,Subtotal - Emergency Power Supply Contracts,85.55%,40833237.31,6.9197,6.2367
9,Wholesale Electricity Spot Market (WESM),14.40%,6875306.91,2.1797,6.2367
