In [122]:
import os
import re
import tabula
import pandas as pd
import numpy as np

In [123]:
# Set the JAVA_HOME environment variable to the Java installation directory
os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk/libexec/openjdk.jdk"

In [124]:
# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [125]:
%pip install -q tabula-py
%pip install requests beautifulsoup4
%pip install openpyxl
%pip install selenium
%pip install webdriver-manager

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### **Importing**

In [126]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = "https://morepower.com.ph/monthly-rates/"
pdf_folder = "pdf downloads"

# Create the folder if it doesn't exist
os.makedirs(pdf_folder, exist_ok=True)

# Define headers to include in the request
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Send a GET request to the webpage with headers
response = requests.get(url, headers=headers)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all links that end with .pdf
    pdf_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.pdf')]
    
    # Download each PDF
    for link in pdf_links:
        pdf_url = urljoin(url, link)  # Create the full URL if it's relative
        pdf_filename = os.path.join(pdf_folder, os.path.basename(pdf_url))
        
        # Send a GET request to the PDF URL
        pdf_response = requests.get(pdf_url, headers=headers)
        
        if pdf_response.status_code == 200:
            # Save the PDF content
            with open(pdf_filename, "wb") as pdf_file:
                pdf_file.write(pdf_response.content)
            print(f"Downloaded {pdf_filename}")
        else:
            print(f"Failed to download {pdf_url}. Status code: {pdf_response.status_code}")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

Downloaded pdf downloads/Generation-Charge_July2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_June-2024_Web-Upload_revised0618.pdf
Downloaded pdf downloads/Generation-Charge_May-2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Apr-2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Mar-2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Feb-2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Jan-2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Dec-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Nov-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Oct-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Sep-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_August-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_July-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_June-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge-f

### **Processing**

In [135]:
# Initialize dictionaries to store keys for DataFrames with and without "Supplier"
supplier_present_keys = {}
supplier_absent_keys = {}

# Loop through all PDF files in the folder
for filename in os.listdir(pdf_folder_path):
    try:
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder_path, filename)

            # Read all tables from the specified page(s)
            tables = tabula.read_pdf(pdf_path, lattice=True, pages='all', multiple_tables=True)

            # Ensure there is at least one table
            if not tables:
                raise ValueError(f"No tables found in the PDF: {filename}")

            # Process the first table (assuming it has headers)
            first_table = tables[0]

            # Extract column names from the first table
            first_table_columns = first_table.columns

            # Initialize a list to store processed DataFrames
            processed_tables = [first_table]

            # Process each subsequent table
            for table in tables[1:]:
                # Convert the header row of the current table to a DataFrame row
                table_header_as_row = pd.DataFrame([table.columns.tolist()], columns=table.columns)
                
                # Append the header row to the current table data
                table_with_header_as_row = pd.concat([table_header_as_row, table], ignore_index=True)

                # Check if the number of columns matches
                if len(table_with_header_as_row.columns) != len(first_table_columns):
                    # Align columns by adding missing columns with NaN values
                    missing_cols = len(first_table_columns) - len(table_with_header_as_row.columns)
                    if missing_cols > 0:
                        # Add missing columns with NaN values
                        table_with_header_as_row = pd.concat(
                            [table_with_header_as_row, pd.DataFrame(columns=[f"Column_{i+1}" for i in range(missing_cols)])], 
                            axis=1
                        )
                    elif missing_cols < 0:
                        # Truncate extra columns
                        table_with_header_as_row = table_with_header_as_row.iloc[:, :len(first_table_columns)]
                
                # Rename columns of the current table to match the first table
                table_with_header_as_row.columns = first_table_columns
                
                # Append the table to the list of processed tables
                processed_tables.append(table_with_header_as_row)

            # Combine all tables into a single DataFrame, stacking vertically
            df = pd.concat(processed_tables, ignore_index=True)

            # Check if any of the column headers contain the word "Supplier"
            if any('Supplier' in col for col in df.columns):
                # Store the key in supplier_present_keys dictionary
                supplier_present_keys[filename] = df
            else:
                # Store the key in supplier_absent_keys dictionary
                supplier_absent_keys[filename] = df

    except Exception as e:
        print(f"Error processing file {filename}: {e}")

# Output the keys for verification
print("Files with 'Supplier' in headers:", supplier_present_keys.keys())
print("Files without 'Supplier' in headers:", supplier_absent_keys.keys())

Files with 'Supplier' in headers: dict_keys(['Generation-Charge_June-2024_Web-Upload_revised0618.pdf', 'Generation-Charge_July2024_Web-Upload.pdf', 'Generation-Charge_Jan-2024_Web-Upload.pdf', 'Generation-Charge-for-June-2022.pdf', 'Generation-Charge-for-July-2022.pdf', 'Generation-Charge-for-November-2022.pdf', 'Generation-Charge_Sep-2023_Web-Upload.pdf', 'Generation-Charge_Oct-2023_Web-Upload.pdf', 'Generation-Charge-for-November-2021.pdf', 'Generation-Charge_Dec-2023_Web-Upload.pdf', 'Generation-Charge-for-March-2023.pdf', 'Generation-Charge_Nov-2023_Web-Upload.pdf', 'Generation-Charge-for-March-2022.pdf', 'Generation-Charge-for-April-2023.pdf', 'Generation-Charge-for-April-2022.pdf', 'Generation-Charge_Feb-2024_Web-Upload.pdf', 'Generation-Charge_June-2023_Web-Upload.pdf', 'Generation-Charge_Mar-2024_Web-Upload.pdf', 'Generation-Charge-for-September-2021.pdf', 'Generation-Charge-for-February-2023.pdf', 'Generation-Charge-for-February-2022.pdf', 'Generation-Charge-for-May-2022.pdf',

#### for dataframes w/ column headers

In [189]:
supplier_present_keys.keys()

dict_keys(['Generation-Charge_June-2024_Web-Upload_revised0618.pdf', 'Generation-Charge_July2024_Web-Upload.pdf', 'Generation-Charge_Jan-2024_Web-Upload.pdf', 'Generation-Charge-for-June-2022.pdf', 'Generation-Charge-for-July-2022.pdf', 'Generation-Charge-for-November-2022.pdf', 'Generation-Charge_Sep-2023_Web-Upload.pdf', 'Generation-Charge_Oct-2023_Web-Upload.pdf', 'Generation-Charge-for-November-2021.pdf', 'Generation-Charge_Dec-2023_Web-Upload.pdf', 'Generation-Charge-for-March-2023.pdf', 'Generation-Charge_Nov-2023_Web-Upload.pdf', 'Generation-Charge-for-March-2022.pdf', 'Generation-Charge-for-April-2023.pdf', 'Generation-Charge-for-April-2022.pdf', 'Generation-Charge_Feb-2024_Web-Upload.pdf', 'Generation-Charge_June-2023_Web-Upload.pdf', 'Generation-Charge_Mar-2024_Web-Upload.pdf', 'Generation-Charge-for-September-2021.pdf', 'Generation-Charge-for-February-2023.pdf', 'Generation-Charge-for-February-2022.pdf', 'Generation-Charge-for-May-2022.pdf', 'Generation-Charge-for-August-202

In [190]:
# Define a function to extract and format the date from the filename
def extract_date_from_filename(filename):
    # Use regular expressions to match different date formats
    match = re.search(r'(\d{2,4})(?:-|_|-Web-Upload)?(?:_(\d{2,4}))?', filename)
    if match:
        # Handle different formats
        if len(match.group(1)) == 4:  # e.g., 2024
            year = match.group(1)
            month = match.group(2) if match.group(2) else '01'
        elif len(match.group(1)) == 2:  # e.g., 06
            year = '20' + match.group(2)  # Assuming year is in the 2000s
            month = match.group(1)
        else:
            return None
        return f"{month}-{year[-2:]}"
    return None

# Initialize an empty DataFrame to hold all data
big_df = pd.DataFrame()

# Iterate over each item in the dictionary
for key, df in supplier_present_keys.items():
    # Slicing columns
    keywords = ["Supplier", "%", "Purchased", "Total"]
    filtered_columns = [col for col in df.columns if any(keyword in col for keyword in keywords)]
    sliced_columns = df[filtered_columns]
    
    # Slicing rows
    # Step 1: Find the first non-null row index in the first column
    first_non_null_idx = sliced_columns[sliced_columns.iloc[:, 0].notna()].index[0]

    # Step 2: Find the row index where the value in the first column is "WESM"
    wesm_row_idx = sliced_columns[sliced_columns.iloc[:, 0].str.contains("WESM", na=False, case=False)].index

    # Step 3: Find the row index where the value in the first column is "TOTAL"
    total_row_idx = sliced_columns[sliced_columns.iloc[:, 0].str.contains("TOTAL", na=False, case=False)].index

    # Step 4: Slice the DataFrame to include the first non-null row, up to "WESM", and include "TOTAL"
    if not wesm_row_idx.empty and not total_row_idx.empty:
        wesm_row_idx = wesm_row_idx[0]  # Get the index of the "WESM" row
        total_row_idx = total_row_idx[0]  # Get the index of the "TOTAL" row
        
        # Slice rows from the first non-null row to the "WESM" row (inclusive)
        sliced_rows = sliced_columns.loc[first_non_null_idx:wesm_row_idx]
        
        # Append the "TOTAL" row if it's not already in the slice
        if total_row_idx > wesm_row_idx:
            total_row = sliced_columns.loc[[total_row_idx]]
            sliced_rows = pd.concat([sliced_rows, total_row], ignore_index=True)

    sliced_rows.columns = ["Power Supplier", "%", "kWh", "Total Generation Cost"]

    # Calculate Average Generation Cost
    def calculate_average_generation_cost(row):
        try:
            total_cost = float(row['Total Generation Cost'].replace(',', ''))
            kwh = float(row['kWh'].replace(',', ''))
            return total_cost / kwh
        except (ValueError, TypeError):
            return np.nan

    sliced_rows['Average Generation Cost'] = sliced_rows.apply(calculate_average_generation_cost, axis=1)
    
    # Drop the "Total Generation Cost" column
    sliced_rows = sliced_rows.drop(columns=['Total Generation Cost'])

    # Add the date column
    date_str = extract_date_from_filename(key)
    sliced_rows['Date'] = date_str

    # Append to big_df
    big_df = pd.concat([big_df, sliced_rows], ignore_index=True)

big_df

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost,Date
0,Sem Calaca Power Corp.,25.84%,17825000.00,4.60928,01-24
1,KEPCO SPC Power Corp.,20.58%,14200000.00,4.871499,01-24
2,Energy Development Corp,14.05%,9692000.00,6.346905,01-24
3,Spot Market (WESM),39.30%,27109551.24,2.28216,01-24
4,SUBTOTAL for Power Suppliers,100.00%,68988059.24,3.996886,01-24
5,Sem Calaca Power Corp.,29.98%,18125000.00,4.835937,01-24
6,KEPCO SPC Power Corp.,23.99%,14500000.00,4.931647,01-24
7,Energy Development Corp,16.01%,9678000.00,6.181348,01-24
8,Spot Market (WESM),,,,01-24
9,SUBTOTAL for Power Suppliers,100.00%,60453542.35,7.570099,01-24


#### for dataframes w/o column headers

In [191]:
supplier_absent_keys.keys()

dict_keys(['GenCharge_Mar2020.pdf', 'GenCharge_Jul2020.pdf', 'GenCharge_May2020.pdf', 'GenCharge_Aug2020.pdf', 'GenCharge_Oct2020.pdf', 'GenCharge_Sep2020.pdf', 'GenCharge_Apr2020.pdf', 'GenCharge_Dec2020.pdf', 'GenCharge_202101.pdf', 'GenCharge_202102.pdf', 'GenCharge_Nov2020.pdf', 'GenCharge_202103.pdf', 'GenCharge_202107.pdf', 'GenCharge_202106.pdf', 'GenCharge_202104.pdf', 'GenCharge_202105.pdf'])

#### for troubleshooting loop

##### for dataframes w/ column headers

In [185]:
df = supplier_present_keys["Generation-Charge-for-June-2022.pdf"]
df

Unnamed: 0,Power\rSupplier,% Share on\rEnergy\rPurchased,Energy\rPurchased\rkWh,Basic\rGeneration Cost\rPhP,Other Cost\rAdjustment\rPhP,Total\rGeneration Cost\rPhP
0,,(A),(B),(C),(D = B+C),
1,PSALM Corporation,90.44%,50225000.00,176921595.50,"- 2,537,005.75",174384589.76
2,Spot Market (WESM),9.40%,5217754.48,51112938.24,2685626.41,53798564.65
3,Net Metering Customers,0.16%,90392.70,363315.41,,363315.41
4,Others,,,,,
5,,,,,,
6,TOTAL,100.00%,55533147.18,228397849.15,148620.67,228546469.82
7,,,,,,
8,TOTAL GENERATION CHARGE FOR THE MONTH ...,,,,,


In [186]:
#Slicing columns

# Define the keywords to look for in the column names
keywords = ["Supplier", "%", "Purchased", "Total"]

# Filter the columns based on the presence of any of the keywords
filtered_columns = [col for col in df.columns if any(keyword in col for keyword in keywords)]

# Create the new DataFrame with the filtered columns
sliced_columns = df[filtered_columns]

sliced_columns

Unnamed: 0,Power\rSupplier,% Share on\rEnergy\rPurchased,Energy\rPurchased\rkWh,Total\rGeneration Cost\rPhP
0,,(A),(B),
1,PSALM Corporation,90.44%,50225000.00,174384589.76
2,Spot Market (WESM),9.40%,5217754.48,53798564.65
3,Net Metering Customers,0.16%,90392.70,363315.41
4,Others,,,
5,,,,
6,TOTAL,100.00%,55533147.18,228546469.82
7,,,,
8,TOTAL GENERATION CHARGE FOR THE MONTH ...,,,


In [187]:
#Slicing rows

# Step 1: Find the first non-null row index in the first column
first_non_null_idx = sliced_columns[sliced_columns.iloc[:, 0].notna()].index[0]

# Step 2: Find the row index where the value in the first column is "WESM"
wesm_row_idx = sliced_columns[sliced_columns.iloc[:, 0].str.contains("WESM", na=False, case=False)].index

# Step 3: Find the row index where the value in the first column is "TOTAL"
total_row_idx = sliced_columns[sliced_columns.iloc[:, 0].str.contains("TOTAL", na=False, case=False)].index

# Step 4: Slice the DataFrame to include the first non-null row, up to "WESM", and include "TOTAL"
if not wesm_row_idx.empty and not total_row_idx.empty:
    wesm_row_idx = wesm_row_idx[0]  # Get the index of the "WESM" row
    total_row_idx = total_row_idx[0]  # Get the index of the "TOTAL" row
    
    # Slice rows from the first non-null row to the "WESM" row (inclusive)
    sliced_rows = sliced_columns.loc[first_non_null_idx:wesm_row_idx]
    
    # Append the "TOTAL" row if it's not already in the slice
    if total_row_idx > wesm_row_idx:
        total_row = sliced_columns.loc[[total_row_idx]]
        sliced_rows = pd.concat([sliced_rows, total_row], ignore_index=True)

sliced_rows.columns = ["Power Supplier", "%", "kWh", "Total Generation Cost"]
sliced_rows

Unnamed: 0,Power Supplier,%,kWh,Total Generation Cost
0,PSALM Corporation,90.44%,50225000.0,174384589.76
1,Spot Market (WESM),9.40%,5217754.48,53798564.65
2,TOTAL,100.00%,55533147.18,228546469.82


In [188]:
# Find the row where the value in the first column is "TOTAL"
total_row = sliced_rows[sliced_rows.iloc[:, 0].str.contains("TOTAL", na=False)]

# Extract the values for "Total Generation Cost" and "kWh"
if not total_row.empty:
    total_generation_cost_str = total_row['Total Generation Cost'].values[0]
    kwh_purchased_str = total_row['kWh'].values[0]

    # Convert the string values to float by removing commas
    total_generation_cost = float(total_generation_cost_str.replace(',', ''))
    kwh_purchased = float(kwh_purchased_str.replace(',', ''))

    # Calculate the Average Generation Cost for each row
    def calculate_average_generation_cost(row):
        try:
            total_cost = float(row['Total Generation Cost'].replace(',', ''))
            kwh = float(row['kWh'].replace(',', ''))
            return total_cost / kwh
        except (ValueError, TypeError):
            return np.nan

    sliced_rows['Average Generation Cost'] = sliced_rows.apply(calculate_average_generation_cost, axis=1)

    # Drop the "Total Generation Cost" column
    sliced_rows = sliced_rows.drop(columns=['Total Generation Cost'])

sliced_rows

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost
0,PSALM Corporation,90.44%,50225000.0,3.472067
1,Spot Market (WESM),9.40%,5217754.48,10.310674
2,TOTAL,100.00%,55533147.18,4.115496


##### for dataframes w/o column headers

In [192]:
df = supplier_absent_keys["GenCharge_Jul2020.pdf"]
df

Unnamed: 0.1,GENERATION CHARGE for JUNE 2020,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,Power Supplier,% Share on\rEnergy\rPurchased,Energy\rPurchased\rkWh,Basic Generation\rCost\rPhP,Other Cost\rAdjustment\rPhP,Total Generation\rCost for the Month\rPhP,Average\rGeneration Rate\rPhP/KWH
1,,(A),(B),(C),(D = B+C),(D/A),
2,BILATERAL CONTRACTS,,,,,,
3,1. Panay Energy Development Corporation,70.58%,39843995.26,247045863.94,-,247045863.94,6.2003
4,2. Panay Power Corporation,2.18%,1229643.20,32927330.56,-,32927330.56,26.7780
5,3. KEPCO SPC Power Corporation,6.59%,3720000.00,17388808.81,-,17388808.81,4.6744
6,"4. Aboitiz Power Renewables, Incorporated",13.18%,7440000.00,31172724.00,-,31172724.00,4.1899
7,Subtotal - Emergency Power Supply Contracts,92.52%,52233638.46,328534727.32,-,328534727.32,6.2897
8,Wholesale Electricity Spot Market (WESM),7.48%,4221024.01,13420570.11,887474.90,14308045.01,3.3897
9,Export Energy from Net Metering Customers,0.00%,-,-,-,-,-


In [193]:
# Define the keywords to search for
keywords = ["CONTRACTS", "%", "Purchased", "Average"]

# Create a list to store the column indices that match the criteria
matching_columns = []

# Find columns where the keywords are present as row values
for keyword in keywords:
    if keyword == "Purchased":
        # For "Purchased", we need to handle the second occurrence
        purchased_columns = df.apply(lambda col: col.str.contains(keyword, na=False)).any()
        # Get the indices of all columns with "Purchased"
        purchased_indices = purchased_columns[purchased_columns].index.tolist()
        if len(purchased_indices) > 1:
            # Select the second occurrence
            matching_columns.append(purchased_indices[1])
    else:
        # For all other keywords, find columns containing the keyword as a row value
        matching_columns += df.columns[df.apply(lambda col: col.str.contains(keyword, na=False)).any()].tolist()

# Drop duplicates in case there are overlaps in columns
matching_columns = pd.Index(matching_columns).unique()

# Create the new DataFrame with the selected columns
sliced_columns = df[matching_columns]

sliced_columns

Unnamed: 0.1,GENERATION CHARGE for JUNE 2020,Unnamed: 0,Unnamed: 1,Unnamed: 5
0,Power Supplier,% Share on\rEnergy\rPurchased,Energy\rPurchased\rkWh,Average\rGeneration Rate\rPhP/KWH
1,,(A),(B),
2,BILATERAL CONTRACTS,,,
3,1. Panay Energy Development Corporation,70.58%,39843995.26,6.2003
4,2. Panay Power Corporation,2.18%,1229643.20,26.7780
5,3. KEPCO SPC Power Corporation,6.59%,3720000.00,4.6744
6,"4. Aboitiz Power Renewables, Incorporated",13.18%,7440000.00,4.1899
7,Subtotal - Emergency Power Supply Contracts,92.52%,52233638.46,6.2897
8,Wholesale Electricity Spot Market (WESM),7.48%,4221024.01,3.3897
9,Export Energy from Net Metering Customers,0.00%,-,-


In [194]:
# Attempt to find the index of the row containing the word "CONTRACTS" in the first column
contract_indices = sliced_columns[sliced_columns.iloc[:, 0].str.contains('CONTRACTS', case=False, na=False)].index

if not contract_indices.empty:
    # If "CONTRACTS" is found, use it as the start index
    start_index = contract_indices[0] + 1
else:
    # If "CONTRACTS" is not found, find the first non-null value in the first column
    first_non_null_index = sliced_columns[sliced_columns.iloc[:, 0].notnull()].index[0]
    start_index = first_non_null_index  # Include this row as well

# Find the index of the row containing the word "WESM" in the first column
wesm_index = sliced_columns[sliced_columns.iloc[:, 0].str.contains(r'WESM', case=False, na=False)].index[0]

# Slice the DataFrame to include rows between start_index and end_index (inclusive)
sliced_rows = sliced_columns.iloc[start_index:end_index + 1]

# Exclude the row that contains the word "Metering" in the first column
sliced_rows = sliced_rows[~sliced_rows.iloc[:, 0].str.contains('Metering', case=False, na=False)]

# Uncomment and adjust the following line if you need to rename columns
sliced_rows.columns = ["Power Supplier", "%", "kWh", "Average Generation Cost"]

sliced_rows

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost
3,1. Panay Energy Development Corporation,70.58%,39843995.26,6.2003
4,2. Panay Power Corporation,2.18%,1229643.2,26.778
5,3. KEPCO SPC Power Corporation,6.59%,3720000.0,4.6744
6,"4. Aboitiz Power Renewables, Incorporated",13.18%,7440000.0,4.1899
7,Subtotal - Emergency Power Supply Contracts,92.52%,52233638.46,6.2897
8,Wholesale Electricity Spot Market (WESM),7.48%,4221024.01,3.3897


In [195]:
import re

# Function to clean the "Power Supplier" column
def clean_power_supplier(value):
    # Remove leading numbers and periods
    value = re.sub(r'^\d+\.\s*', '', value)
    return value

# Apply the cleaning function to the first column of sliced_df
sliced_rows.iloc[:, 0] = sliced_rows.iloc[:, 0].apply(clean_power_supplier)

sliced_rows_df = sliced_rows.reset_index(drop=True)
sliced_rows_df

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost
0,Panay Energy Development Corporation,70.58%,39843995.26,6.2003
1,Panay Power Corporation,2.18%,1229643.2,26.778
2,KEPCO SPC Power Corporation,6.59%,3720000.0,4.6744
3,"Aboitiz Power Renewables, Incorporated",13.18%,7440000.0,4.1899
4,Subtotal - Emergency Power Supply Contracts,92.52%,52233638.46,6.2897
5,Wholesale Electricity Spot Market (WESM),7.48%,4221024.01,3.3897


In [196]:
# Find the column that contains the word "Average"
average_column = sliced_columns.columns[sliced_columns.apply(lambda col: col.str.contains('Average', case=False, na=False)).any()][0]

# Find rows where the first column contains the word "TOTAL"
total_row = sliced_columns.loc[sliced_columns.iloc[:, 0].str.contains("TOTAL", case=False, na=False), average_column]
reversed_row = total_row[::-1]

for value in reversed_row:
    if pd.notnull(value):
        total_row_value = value
        break  # Exit loop once a non-null value is found

In [197]:
sliced_rows_df["Generation Charge"] = total_row_value
sliced_rows_df

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost,Generation Charge
0,Panay Energy Development Corporation,70.58%,39843995.26,6.2003,6.0729
1,Panay Power Corporation,2.18%,1229643.2,26.778,6.0729
2,KEPCO SPC Power Corporation,6.59%,3720000.0,4.6744,6.0729
3,"Aboitiz Power Renewables, Incorporated",13.18%,7440000.0,4.1899,6.0729
4,Subtotal - Emergency Power Supply Contracts,92.52%,52233638.46,6.2897,6.0729
5,Wholesale Electricity Spot Market (WESM),7.48%,4221024.01,3.3897,6.0729
