In [122]:
import os
import re
import tabula
import pandas as pd
import numpy as np

In [123]:
# Set the JAVA_HOME environment variable to the Java installation directory
os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk/libexec/openjdk.jdk"

In [124]:
# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [125]:
%pip install -q tabula-py
%pip install requests beautifulsoup4
%pip install openpyxl
%pip install selenium
%pip install webdriver-manager

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### **Importing**

In [126]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = "https://morepower.com.ph/monthly-rates/"
pdf_folder = "pdf downloads"

# Create the folder if it doesn't exist
os.makedirs(pdf_folder, exist_ok=True)

# Define headers to include in the request
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Send a GET request to the webpage with headers
response = requests.get(url, headers=headers)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all links that end with .pdf
    pdf_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.pdf')]
    
    # Download each PDF
    for link in pdf_links:
        pdf_url = urljoin(url, link)  # Create the full URL if it's relative
        pdf_filename = os.path.join(pdf_folder, os.path.basename(pdf_url))
        
        # Send a GET request to the PDF URL
        pdf_response = requests.get(pdf_url, headers=headers)
        
        if pdf_response.status_code == 200:
            # Save the PDF content
            with open(pdf_filename, "wb") as pdf_file:
                pdf_file.write(pdf_response.content)
            print(f"Downloaded {pdf_filename}")
        else:
            print(f"Failed to download {pdf_url}. Status code: {pdf_response.status_code}")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

Downloaded pdf downloads/Generation-Charge_July2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_June-2024_Web-Upload_revised0618.pdf
Downloaded pdf downloads/Generation-Charge_May-2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Apr-2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Mar-2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Feb-2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Jan-2024_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Dec-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Nov-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Oct-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_Sep-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_August-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_July-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge_June-2023_Web-Upload.pdf
Downloaded pdf downloads/Generation-Charge-f

### **Processing**

In [202]:
# Initialize dictionaries to store keys for DataFrames with and without "Supplier"
supplier_present_keys = {}
supplier_absent_keys = {}

# Loop through all PDF files in the folder
for filename in os.listdir(pdf_folder_path):
    try:
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder_path, filename)

            # Read all tables from the specified page(s)
            tables = tabula.read_pdf(pdf_path, lattice=True, pages='all', multiple_tables=True)

            # Ensure there is at least one table
            if not tables:
                raise ValueError(f"No tables found in the PDF: {filename}")

            # Process the first table (assuming it has headers)
            first_table = tables[0]

            # Extract column names from the first table
            first_table_columns = first_table.columns

            # Initialize a list to store processed DataFrames
            processed_tables = [first_table]

            # Process each subsequent table
            for table in tables[1:]:
                # Convert the header row of the current table to a DataFrame row
                table_header_as_row = pd.DataFrame([table.columns.tolist()], columns=table.columns)
                
                # Append the header row to the current table data
                table_with_header_as_row = pd.concat([table_header_as_row, table], ignore_index=True)

                # Check if the number of columns matches
                if len(table_with_header_as_row.columns) != len(first_table_columns):
                    # Align columns by adding missing columns with NaN values
                    missing_cols = len(first_table_columns) - len(table_with_header_as_row.columns)
                    if missing_cols > 0:
                        # Add missing columns with NaN values
                        table_with_header_as_row = pd.concat(
                            [table_with_header_as_row, pd.DataFrame(columns=[f"Column_{i+1}" for i in range(missing_cols)])], 
                            axis=1
                        )
                    elif missing_cols < 0:
                        # Truncate extra columns
                        table_with_header_as_row = table_with_header_as_row.iloc[:, :len(first_table_columns)]
                
                # Rename columns of the current table to match the first table
                table_with_header_as_row.columns = first_table_columns
                
                # Append the table to the list of processed tables
                processed_tables.append(table_with_header_as_row)

            # Combine all tables into a single DataFrame, stacking vertically
            df = pd.concat(processed_tables, ignore_index=True)

            # Check if any of the column headers contain the word "Supplier"
            if any('Supplier' in col or 'Power' in col for col in df.columns):
                # Store the key in supplier_present_keys dictionary
                supplier_present_keys[filename] = df
            else:
                # Store the key in supplier_absent_keys dictionary
                supplier_absent_keys[filename] = df

    except Exception as e:
        print(f"Error processing file {filename}: {e}")

# Output the keys for verification
print("Files with 'Supplier' in headers:", supplier_present_keys.keys())
print("Files without 'Supplier' in headers:", supplier_absent_keys.keys())

Files with 'Supplier' in headers: dict_keys(['Generation-Charge_June-2024_Web-Upload_revised0618.pdf', 'Generation-Charge_July2024_Web-Upload.pdf', 'Generation-Charge_Jan-2024_Web-Upload.pdf', 'Generation-Charge-for-June-2022.pdf', 'Generation-Charge-for-July-2022.pdf', 'Generation-Charge-for-November-2022.pdf', 'Generation-Charge_Sep-2023_Web-Upload.pdf', 'Generation-Charge_Oct-2023_Web-Upload.pdf', 'Generation-Charge-for-November-2021.pdf', 'Generation-Charge_Dec-2023_Web-Upload.pdf', 'Generation-Charge-for-March-2023.pdf', 'Generation-Charge_Nov-2023_Web-Upload.pdf', 'Generation-Charge-for-March-2022.pdf', 'Generation-Charge-for-April-2023.pdf', 'Generation-Charge-for-April-2022.pdf', 'Generation-Charge_Feb-2024_Web-Upload.pdf', 'Generation-Charge_June-2023_Web-Upload.pdf', 'Generation-Charge_Mar-2024_Web-Upload.pdf', 'Generation-Charge-for-September-2021.pdf', 'Generation-Charge-for-February-2023.pdf', 'Generation-Charge-for-February-2022.pdf', 'Generation-Charge-for-May-2022.pdf',

#### for dataframes w/ column headers

In [245]:
supplier_present_keys.keys()

dict_keys(['Generation-Charge_June-2024_Web-Upload_revised0618.pdf', 'Generation-Charge_July2024_Web-Upload.pdf', 'Generation-Charge_Jan-2024_Web-Upload.pdf', 'Generation-Charge-for-June-2022.pdf', 'Generation-Charge-for-July-2022.pdf', 'Generation-Charge-for-November-2022.pdf', 'Generation-Charge_Sep-2023_Web-Upload.pdf', 'Generation-Charge_Oct-2023_Web-Upload.pdf', 'Generation-Charge-for-November-2021.pdf', 'Generation-Charge_Dec-2023_Web-Upload.pdf', 'Generation-Charge-for-March-2023.pdf', 'Generation-Charge_Nov-2023_Web-Upload.pdf', 'Generation-Charge-for-March-2022.pdf', 'Generation-Charge-for-April-2023.pdf', 'Generation-Charge-for-April-2022.pdf', 'Generation-Charge_Feb-2024_Web-Upload.pdf', 'Generation-Charge_June-2023_Web-Upload.pdf', 'Generation-Charge_Mar-2024_Web-Upload.pdf', 'Generation-Charge-for-September-2021.pdf', 'Generation-Charge-for-February-2023.pdf', 'Generation-Charge-for-February-2022.pdf', 'Generation-Charge-for-May-2022.pdf', 'Generation-Charge-for-August-202

In [283]:
dataframes = []

from datetime import datetime

def extract_date_key(filename):
    # Define patterns for different filename formats
    patterns = [
        r'(\w+)-(\d{4})',        # Matches "Month-YYYY" (e.g., "June-2024")
        r'(\d{4})-(\d{2})',      # Matches "YYYY-MM" (e.g., "2024-06")
        r'(\d{2})-(\d{2})',      # Matches "MM-YY" (e.g., "06-24")
        r'(\d{4})(\d{2})',       # Matches "YYYYMM" (e.g., "202406")
        r'(\d{2})-(\d{4})',      # Matches "MM-YYYY" (e.g., "06-2024")
        r'(\d{4})-(\w+)',        # Matches "YYYY-Month" (e.g., "2024-July")
        r'(\w+)(\d{4})'          # Matches "MonthYYYY" (e.g., "July2024")
    ]
    
    # Define month abbreviation to number mapping
    month_abbr_to_num = {
        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06',
        'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
        'January': '01', 'February': '02', 'March': '03', 'April': '04', 'May': '05',
        'June': '06', 'July': '07', 'August': '08', 'September': '09', 'October': '10',
        'November': '11', 'December': '12'
    }
    
    # Extract text after the first underscore and remove file extension
    parts = filename.split('_', 1)
    filename = parts[1] if len(parts) > 1 else filename
    filename = re.sub(r'\.pdf$', '', filename)
    
    # Try to find a match with each pattern
    for pattern in patterns:
        match = re.search(pattern, filename)
        if match:
            groups = match.groups()
            if len(groups) == 2:
                month, year = groups
                if month.isdigit():
                    # Format: YYYY-MM, MM-YY, or YYYYMM
                    if len(month) == 2:  # MM or YY
                        return f"{month}-{year[-2:]}"  # MM-YY format
                    else:  # YYYYMM
                        return f"{month[-2:]}-{year[-2:]}"  # MM-YY format
                elif month in month_abbr_to_num:
                    # Handle formats with Month-YYYY or YYYY-Month
                    month_num = month_abbr_to_num[month]
                    return f"{month_num}-{year[-2:]}"  # MM-YY format
            elif len(groups) == 3:
                month, year = groups[1], groups[2]
                # Handle formats with Month-YYMM
                if month in month_abbr_to_num:
                    month_num = month_abbr_to_num[month]
                    return f"{month_num}-{year[-2:]}"  # MM-YY format
            elif len(groups) == 1 and groups[0] in month_abbr_to_num:
                # Handle formats with MonthYYYY
                month_num = month_abbr_to_num[groups[0]]
                return f"{month_num}-{year[-2:]}"  # MM-YY format

    # If no pattern matches, return an error
    raise ValueError(f"Date not found in filename: {filename}")

# Process each file in the dictionary
for filename, df in supplier_present_keys.items():
    # Slicing columns
    keywords = ["Power", "%", "Purchased", "Total"]
    filtered_columns = [col for col in df.columns if any(keyword in col for keyword in keywords)]
    sliced_columns = df[filtered_columns]

    # Slicing rows
    first_non_null_idx = sliced_columns[sliced_columns.iloc[:, 1].notna()].index[0]
    wesm_row_idx = sliced_columns[sliced_columns.iloc[:, 0].str.contains("WESM", na=False, case=False)].index
    total_row_idx = sliced_columns[sliced_columns.iloc[:, 0].str.contains("TOTAL", na=False, case=False)].index

    if not wesm_row_idx.empty and not total_row_idx.empty:
        wesm_row_idx = wesm_row_idx[0]
        total_row_idx = total_row_idx[0]

        sliced_rows = sliced_columns.loc[first_non_null_idx:wesm_row_idx]
        
        if total_row_idx > wesm_row_idx:
            total_row = sliced_columns.loc[[total_row_idx]]
            sliced_rows = pd.concat([sliced_rows, total_row], ignore_index=True)

    # Clean the "Power Supplier" column
    def clean_power_supplier(value):
        if pd.isnull(value):
            return value
        return re.sub(r'^\d+\.\s*', '', value)
    
    sliced_rows.iloc[:, 0] = sliced_rows.iloc[:, 0].apply(clean_power_supplier)
    sliced_rows = sliced_rows.dropna(subset=[sliced_rows.columns[0]])
    sliced_rows = sliced_rows.reset_index(drop=True)

    # Calculate the Average Generation Cost
    def calculate_average_generation_cost(row):
        try:
            total_cost = float(row['Total Generation Cost'].replace(',', ''))
            kwh = float(row['kWh'].replace(',', ''))
            return total_cost / kwh
        except (ValueError, TypeError):
            return np.nan

    if 'Total Generation Cost' in sliced_rows.columns and 'kWh' in sliced_rows.columns:
        sliced_rows['Average Generation Cost'] = sliced_rows.apply(calculate_average_generation_cost, axis=1)
        sliced_rows = sliced_rows.drop(columns=['Total Generation Cost'])

    # Rename the columns
    sliced_rows.columns = ["Power Supplier", "%", "kWh", "Average Generation Cost"]

    # Add the Date column
    date_str = extract_date_key(filename)
    if date_str:
        sliced_rows['Date'] = date_str

    # Append the DataFrame to the list
    dataframes.append(sliced_rows)

# Concatenate all DataFrames into one
big_df = pd.concat(dataframes, ignore_index=True)

big_df

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost,Date
0,Sem Calaca Power Corp.,25.84%,17825000.00,82160418.62,06-24
1,KEPCO SPC Power Corp.,20.58%,14200000.00,69175290.48,06-24
2,Energy Development Corp,14.05%,9692000.00,61514203.44,06-24
3,Spot Market (WESM),39.30%,27109551.24,61868333.90,06-24
4,SUBTOTAL for Power Suppliers,100.00%,68988059.24,275737392.40,06-24
5,Sem Calaca Power Corp.,29.98%,18125000.00,87651359.32,07-24
6,KEPCO SPC Power Corp.,23.99%,14500000.00,71508884.44,07-24
7,Energy Development Corp,16.01%,9678000.00,59823084.60,07-24
8,Spot Market (WESM),,,-,07-24
9,SUBTOTAL for Power Suppliers,100.00%,60453542.35,457639313.23,07-24


In [281]:
import re

def extract_date_key(filename):
    # Define patterns for different filename formats
    patterns = [
        r'(\w+)-(\d{4})',        # Matches "Month-YYYY" (e.g., "June-2024")
        r'(\d{4})-(\d{2})',      # Matches "YYYY-MM" (e.g., "2024-06")
        r'(\d{2})-(\d{2})',      # Matches "MM-YY" (e.g., "06-24")
        r'(\d{4})(\d{2})',       # Matches "YYYYMM" (e.g., "202406")
        r'(\d{2})-(\d{4})',      # Matches "MM-YYYY" (e.g., "06-2024")
        r'(\d{4})-(\w+)',        # Matches "YYYY-Month" (e.g., "2024-July")
        r'(\w+)(\d{4})'          # Matches "MonthYYYY" (e.g., "July2024")
    ]
    
    # Define month abbreviation to number mapping
    month_abbr_to_num = {
        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06',
        'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
        'January': '01', 'February': '02', 'March': '03', 'April': '04', 'May': '05',
        'June': '06', 'July': '07', 'August': '08', 'September': '09', 'October': '10',
        'November': '11', 'December': '12'
    }
    
    # Extract text after the first underscore and remove file extension
    parts = filename.split('_', 1)
    filename = parts[1] if len(parts) > 1 else filename
    filename = re.sub(r'\.pdf$', '', filename)
    
    # Try to find a match with each pattern
    for pattern in patterns:
        match = re.search(pattern, filename)
        if match:
            groups = match.groups()
            if len(groups) == 2:
                month, year = groups
                if month.isdigit():
                    # Format: YYYY-MM, MM-YY, or YYYYMM
                    if len(month) == 2:  # MM or YY
                        return f"{month}-{year[-2:]}"  # MM-YY format
                    else:  # YYYYMM
                        return f"{month[-2:]}-{year[-2:]}"  # MM-YY format
                elif month in month_abbr_to_num:
                    # Handle formats with Month-YYYY or YYYY-Month
                    month_num = month_abbr_to_num[month]
                    return f"{month_num}-{year[-2:]}"  # MM-YY format
            elif len(groups) == 3:
                month, year = groups[1], groups[2]
                # Handle formats with Month-YYMM
                if month in month_abbr_to_num:
                    month_num = month_abbr_to_num[month]
                    return f"{month_num}-{year[-2:]}"  # MM-YY format
            elif len(groups) == 1 and groups[0] in month_abbr_to_num:
                # Handle formats with MonthYYYY
                month_num = month_abbr_to_num[groups[0]]
                return f"{month_num}-{year[-2:]}"  # MM-YY format

    # If no pattern matches, return an error
    raise ValueError(f"Date not found in filename: {filename}")

# Example usage
filenames = [
    "Generation-Charge_June-2024_Web-Upload_revised0618.pdf",
    "Generation-Charge_July2024_Web-Upload.pdf",
    "Generation-Charge_Jan-2024_Web-Upload.pdf",
    "Generation-Charge_June-2023_Web-Upload.pdf",
    "Generation-Charge_July-2023_Web-Upload.pdf",
    "Generation-Charge-for-September2022.pdf",
    "GenCharge_202101.pdf",
    "GenCharge_202102.pdf",
    "GenCharge_202103.pdf",
    "GenCharge_202107.pdf",
    "Generation-Charge_August-2023_Web-Upload.pdf",
    "revised0618.pdf",
    "Web-Upload.pdf"
]

for filename in filenames:
    try:
        date_key = extract_date_key(filename)
        print(f"Filename: {filename} -> Date extracted: {date_key}")
    except ValueError as e:
        print(e)


June-2024_Web-Upload_revised0618.pdf
Filename: Generation-Charge_June-2024_Web-Upload_revised0618.pdf -> Date extracted: 06-24
July2024_Web-Upload.pdf
Filename: Generation-Charge_July2024_Web-Upload.pdf -> Date extracted: 07-24
Jan-2024_Web-Upload.pdf
Filename: Generation-Charge_Jan-2024_Web-Upload.pdf -> Date extracted: 01-24
June-2023_Web-Upload.pdf
Filename: Generation-Charge_June-2023_Web-Upload.pdf -> Date extracted: 06-23
July-2023_Web-Upload.pdf
Filename: Generation-Charge_July-2023_Web-Upload.pdf -> Date extracted: 07-23
Generation-Charge-for-September2022.pdf
Filename: Generation-Charge-for-September2022.pdf -> Date extracted: 09-22
202101.pdf
Filename: GenCharge_202101.pdf -> Date extracted: 21-01
202102.pdf
Filename: GenCharge_202102.pdf -> Date extracted: 21-02
202103.pdf
Filename: GenCharge_202103.pdf -> Date extracted: 21-03
202107.pdf
Filename: GenCharge_202107.pdf -> Date extracted: 21-07
August-2023_Web-Upload.pdf
Filename: Generation-Charge_August-2023_Web-Upload.pdf 

#### for dataframes w/o column headers

In [191]:
supplier_absent_keys.keys()

dict_keys(['GenCharge_Mar2020.pdf', 'GenCharge_Jul2020.pdf', 'GenCharge_May2020.pdf', 'GenCharge_Aug2020.pdf', 'GenCharge_Oct2020.pdf', 'GenCharge_Sep2020.pdf', 'GenCharge_Apr2020.pdf', 'GenCharge_Dec2020.pdf', 'GenCharge_202101.pdf', 'GenCharge_202102.pdf', 'GenCharge_Nov2020.pdf', 'GenCharge_202103.pdf', 'GenCharge_202107.pdf', 'GenCharge_202106.pdf', 'GenCharge_202104.pdf', 'GenCharge_202105.pdf'])

In [199]:
# Function to extract and format the date from the filename
def extract_date_from_filename(filename):
    # Use regular expressions to match different date formats
    match = re.search(r'(\d{2,4})(?:-|_|-Web-Upload)?(?:_(\d{2,4}))?', filename)
    if match:
        # Handle different formats
        if len(match.group(1)) == 4:  # e.g., 2024
            year = match.group(1)
            month = match.group(2) if match.group(2) else '01'
        elif len(match.group(1)) == 2:  # e.g., 06
            year = '20' + match.group(2)  # Assuming year is in the 2000s
            month = match.group(1)
        else:
            return None
        return f"{month}-{year[-2:]}"
    return None

# Iterate over each item in the dictionary
for key, df in supplier_absent_keys.items():
    # Define the keywords to search for
    keywords = ["CONTRACTS", "%", "Purchased", "Average"]

    # Create a list to store the column indices that match the criteria
    matching_columns = []

    # Find columns where the keywords are present as row values
    for keyword in keywords:
        if keyword == "Purchased":
            # For "Purchased", we need to handle the second occurrence
            purchased_columns = df.apply(lambda col: col.str.contains(keyword, na=False)).any()
            # Get the indices of all columns with "Purchased"
            purchased_indices = purchased_columns[purchased_columns].index.tolist()
            if len(purchased_indices) > 1:
                # Select the second occurrence
                matching_columns.append(purchased_indices[1])
        else:
            # For all other keywords, find columns containing the keyword as a row value
            matching_columns += df.columns[df.apply(lambda col: col.str.contains(keyword, na=False)).any()].tolist()

    # Drop duplicates in case there are overlaps in columns
    matching_columns = pd.Index(matching_columns).unique()

    # Create the new DataFrame with the selected columns
    sliced_columns = df[matching_columns]

    # Attempt to find the index of the row containing the word "CONTRACTS" in the first column
    contract_indices = sliced_columns[sliced_columns.iloc[:, 0].str.contains('CONTRACTS', case=False, na=False)].index

    if not contract_indices.empty:
        # If "CONTRACTS" is found, use it as the start index
        start_index = contract_indices[0] + 1
    else:
        # If "CONTRACTS" is not found, find the first non-null value in the first column
        first_non_null_index = sliced_columns[sliced_columns.iloc[:, 0].notnull()].index[0]
        start_index = first_non_null_index  # Include this row as well

    # Find the index of the row containing the word "WESM" in the first column
    wesm_index = sliced_columns[sliced_columns.iloc[:, 0].str.contains(r'WESM', case=False, na=False)].index[0]

    # Slice the DataFrame to include rows between start_index and end_index (inclusive)
    sliced_rows = sliced_columns.iloc[start_index:wesm_index + 1]

    # Exclude the row that contains the word "Metering" in the first column
    sliced_rows = sliced_rows[~sliced_rows.iloc[:, 0].str.contains('Metering', case=False, na=False)]

    # Uncomment and adjust the following line if you need to rename columns
    sliced_rows.columns = ["Power Supplier", "%", "kWh", "Average Generation Cost"]

    # Function to clean the "Power Supplier" column
    def clean_power_supplier(value):
        # Remove leading numbers and periods
        value = re.sub(r'^\d+\.\s*', '', value)
        return value

    # Apply the cleaning function to the first column of sliced_rows
    sliced_rows['Power Supplier'] = sliced_rows['Power Supplier'].apply(clean_power_supplier)

    # Find the column that contains the word "Average"
    average_column = sliced_columns.columns[sliced_columns.apply(lambda col: col.str.contains('Average', case=False, na=False)).any()][0]

    # Find rows where the first column contains the word "TOTAL"
    total_row = sliced_columns.loc[sliced_columns.iloc[:, 0].str.contains("TOTAL", case=False, na=False), average_column]
    reversed_row = total_row[::-1]

    for value in reversed_row:
        if pd.notnull(value):
            total_row_value = value
            break  # Exit loop once a non-null value is found

    # Add the "Generation Charge" to the sliced_rows DataFrame
    sliced_rows["Generation Charge"] = total_row_value

    # Add the date column
    date_str = extract_date_from_filename(key)
    sliced_rows['Date'] = date_str

    # Append to big_df
    big_df = pd.concat([big_df, sliced_rows], ignore_index=True)

big_df

GenCharge_Mar2020.pdf
GenCharge_Jul2020.pdf
GenCharge_May2020.pdf
GenCharge_Aug2020.pdf
GenCharge_Oct2020.pdf
GenCharge_Sep2020.pdf
GenCharge_Apr2020.pdf
GenCharge_Dec2020.pdf
GenCharge_202101.pdf


ValueError: Length mismatch: Expected axis has 2 elements, new values have 4 elements

#### for troubleshooting loop

##### for dataframes w/ column headers

In [233]:
df = supplier_present_keys["GenCharge_202101.pdf"]
df

Unnamed: 0,Power Source,% Share on\rEnergy\rPurchased,Energy\rPurchased\rkWh,Basic\rGeneration Cost\rPhP,Other Cost Adjustment\rPhP,Total\rGeneration Cost\rPhP,Average\rGeneration Cost\rPhP/kWh
0,,,(A),(B),(C),(D = B+C),(D/A)
1,BILATERAL CONTRACTS,,,,,,
2,1. Panay Energy Development Corporation U1/U2,39.73%,17735739.34,116168975.43,,116168975.43,6.5500
3,2. Panay Energy Development Corporation U3,29.11%,12995729.57,68873193.85,,68873193.85,5.2997
4,3. Panay Power Corporation,0.02%,8000.00,25228998.40,,25228998.40,3153.6
5,4. KEPCO SPC Power Corporation,8.06%,3600000.00,16801590.79,,16801590.79,4.6671
6,"5. Aboitiz Power Renewables, Incorporated",6.55%,2925000.00,20733592.50,,20733592.50,7.0884
7,Subtotal - Interim Power Supply Contracts,83.48%,37264468.91,247806350.97,-,247806350.97,6.6499
8,,,,,,,
9,Wholesale Electricity Spot Market (WESM),16.12%,7196668.13,16308643.53,6774450.35,23083093.88,3.2075


In [234]:
#Slicing columns

# Define the keywords to look for in the column names
keywords = ["Power", "%", "Purchased", "Total"]

# Filter the columns based on the presence of any of the keywords
filtered_columns = [col for col in df.columns if any(keyword in col for keyword in keywords)]

# Create the new DataFrame with the filtered columns
sliced_columns = df[filtered_columns]

sliced_columns

Unnamed: 0,Power Source,% Share on\rEnergy\rPurchased,Energy\rPurchased\rkWh,Total\rGeneration Cost\rPhP
0,,,(A),(D = B+C)
1,BILATERAL CONTRACTS,,,
2,1. Panay Energy Development Corporation U1/U2,39.73%,17735739.34,116168975.43
3,2. Panay Energy Development Corporation U3,29.11%,12995729.57,68873193.85
4,3. Panay Power Corporation,0.02%,8000.00,25228998.40
5,4. KEPCO SPC Power Corporation,8.06%,3600000.00,16801590.79
6,"5. Aboitiz Power Renewables, Incorporated",6.55%,2925000.00,20733592.50
7,Subtotal - Interim Power Supply Contracts,83.48%,37264468.91,247806350.97
8,,,,
9,Wholesale Electricity Spot Market (WESM),16.12%,7196668.13,23083093.88


In [235]:
#Slicing rows

# Step 1: Find the first non-null row index in the first column
first_non_null_idx = sliced_columns[sliced_columns.iloc[:, 1].notna()].index[0]

# Step 2: Find the row index where the value in the first column is "WESM"
wesm_row_idx = sliced_columns[sliced_columns.iloc[:, 0].str.contains("WESM", na=False, case=False)].index

# Step 3: Find the row index where the value in the first column is "TOTAL"
total_row_idx = sliced_columns[sliced_columns.iloc[:, 0].str.contains("TOTAL", na=False, case=False)].index

# Step 4: Slice the DataFrame to include the first non-null row, up to "WESM", and include "TOTAL"
if not wesm_row_idx.empty and not total_row_idx.empty:
    wesm_row_idx = wesm_row_idx[0]  # Get the index of the "WESM" row
    total_row_idx = total_row_idx[0]  # Get the index of the "TOTAL" row
    
    # Slice rows from the first non-null row to the "WESM" row (inclusive)
    sliced_rows = sliced_columns.loc[first_non_null_idx:wesm_row_idx]
    
    # Append the "TOTAL" row if it's not already in the slice
    if total_row_idx > wesm_row_idx:
        total_row = sliced_columns.loc[[total_row_idx]]
        sliced_rows = pd.concat([sliced_rows, total_row], ignore_index=True)

sliced_rows.columns = ["Power Supplier", "%", "kWh", "Total Generation Cost"]
sliced_rows

Unnamed: 0,Power Supplier,%,kWh,Total Generation Cost
2,1. Panay Energy Development Corporation U1/U2,39.73%,17735739.34,116168975.43
3,2. Panay Energy Development Corporation U3,29.11%,12995729.57,68873193.85
4,3. Panay Power Corporation,0.02%,8000.0,25228998.4
5,4. KEPCO SPC Power Corporation,8.06%,3600000.0,16801590.79
6,"5. Aboitiz Power Renewables, Incorporated",6.55%,2925000.0,20733592.5
7,Subtotal - Interim Power Supply Contracts,83.48%,37264468.91,247806350.97
8,,,,
9,Wholesale Electricity Spot Market (WESM),16.12%,7196668.13,23083093.88


In [236]:
import re

# Function to clean the "Power Supplier" column
def clean_power_supplier(value):
    if pd.isnull(value):
        return value  # Return null if the value is null
    # Remove leading numbers and periods
    value = re.sub(r'^\d+\.\s*', '', value)
    return value

# Apply the cleaning function to the first column of sliced_rows
sliced_rows.iloc[:, 0] = sliced_rows.iloc[:, 0].apply(clean_power_supplier)

# Drop rows where the first column value is null
sliced_rows = sliced_rows.dropna(subset=[sliced_rows.columns[0]])

# Reset the index after dropping rows
sliced_rows = sliced_rows.reset_index(drop=True)

sliced_rows

Unnamed: 0,Power Supplier,%,kWh,Total Generation Cost
0,Panay Energy Development Corporation U1/U2,39.73%,17735739.34,116168975.43
1,Panay Energy Development Corporation U3,29.11%,12995729.57,68873193.85
2,Panay Power Corporation,0.02%,8000.0,25228998.4
3,KEPCO SPC Power Corporation,8.06%,3600000.0,16801590.79
4,"Aboitiz Power Renewables, Incorporated",6.55%,2925000.0,20733592.5
5,Subtotal - Interim Power Supply Contracts,83.48%,37264468.91,247806350.97
6,Wholesale Electricity Spot Market (WESM),16.12%,7196668.13,23083093.88


In [237]:
# Find the row where the value in the first column is "TOTAL"
total_row = sliced_rows[sliced_rows.iloc[:, 0].str.contains("TOTAL", na=False)]

# Extract the values for "Total Generation Cost" and "kWh"
if not total_row.empty:
    total_generation_cost_str = total_row['Total Generation Cost'].values[0]
    kwh_purchased_str = total_row['kWh'].values[0]

    # Convert the string values to float by removing commas
    total_generation_cost = float(total_generation_cost_str.replace(',', ''))
    kwh_purchased = float(kwh_purchased_str.replace(',', ''))

    # Calculate the Average Generation Cost for each row
    def calculate_average_generation_cost(row):
        try:
            total_cost = float(row['Total Generation Cost'].replace(',', ''))
            kwh = float(row['kWh'].replace(',', ''))
            return total_cost / kwh
        except (ValueError, TypeError):
            return np.nan

    sliced_rows['Average Generation Cost'] = sliced_rows.apply(calculate_average_generation_cost, axis=1)

    # Drop the "Total Generation Cost" column
    sliced_rows = sliced_rows.drop(columns=['Total Generation Cost'])

sliced_rows

Unnamed: 0,Power Supplier,%,kWh,Total Generation Cost
0,Panay Energy Development Corporation U1/U2,39.73%,17735739.34,116168975.43
1,Panay Energy Development Corporation U3,29.11%,12995729.57,68873193.85
2,Panay Power Corporation,0.02%,8000.0,25228998.4
3,KEPCO SPC Power Corporation,8.06%,3600000.0,16801590.79
4,"Aboitiz Power Renewables, Incorporated",6.55%,2925000.0,20733592.5
5,Subtotal - Interim Power Supply Contracts,83.48%,37264468.91,247806350.97
6,Wholesale Electricity Spot Market (WESM),16.12%,7196668.13,23083093.88


##### for dataframes w/o column headers

In [200]:
df = supplier_absent_keys["GenCharge_202101.pdf"]
df

Unnamed: 0,Power Source,% Share on\rEnergy\rPurchased,Energy\rPurchased\rkWh,Basic\rGeneration Cost\rPhP,Other Cost Adjustment\rPhP,Total\rGeneration Cost\rPhP,Average\rGeneration Cost\rPhP/kWh
0,,,(A),(B),(C),(D = B+C),(D/A)
1,BILATERAL CONTRACTS,,,,,,
2,1. Panay Energy Development Corporation U1/U2,39.73%,17735739.34,116168975.43,,116168975.43,6.5500
3,2. Panay Energy Development Corporation U3,29.11%,12995729.57,68873193.85,,68873193.85,5.2997
4,3. Panay Power Corporation,0.02%,8000.00,25228998.40,,25228998.40,3153.6
5,4. KEPCO SPC Power Corporation,8.06%,3600000.00,16801590.79,,16801590.79,4.6671
6,"5. Aboitiz Power Renewables, Incorporated",6.55%,2925000.00,20733592.50,,20733592.50,7.0884
7,Subtotal - Interim Power Supply Contracts,83.48%,37264468.91,247806350.97,-,247806350.97,6.6499
8,,,,,,,
9,Wholesale Electricity Spot Market (WESM),16.12%,7196668.13,16308643.53,6774450.35,23083093.88,3.2075


In [201]:
# Define the keywords to search for
keywords = ["CONTRACTS", "%", "Purchased", "Average"]

# Create a list to store the column indices that match the criteria
matching_columns = []

# Find columns where the keywords are present as row values
for keyword in keywords:
    if keyword == "Purchased":
        # For "Purchased", we need to handle the second occurrence
        purchased_columns = df.apply(lambda col: col.str.contains(keyword, na=False)).any()
        # Get the indices of all columns with "Purchased"
        purchased_indices = purchased_columns[purchased_columns].index.tolist()
        if len(purchased_indices) > 1:
            # Select the second occurrence
            matching_columns.append(purchased_indices[1])
    else:
        # For all other keywords, find columns containing the keyword as a row value
        matching_columns += df.columns[df.apply(lambda col: col.str.contains(keyword, na=False)).any()].tolist()

# Drop duplicates in case there are overlaps in columns
matching_columns = pd.Index(matching_columns).unique()

# Create the new DataFrame with the selected columns
sliced_columns = df[matching_columns]

sliced_columns

Unnamed: 0,Power Source,% Share on\rEnergy\rPurchased
0,,
1,BILATERAL CONTRACTS,
2,1. Panay Energy Development Corporation U1/U2,39.73%
3,2. Panay Energy Development Corporation U3,29.11%
4,3. Panay Power Corporation,0.02%
5,4. KEPCO SPC Power Corporation,8.06%
6,"5. Aboitiz Power Renewables, Incorporated",6.55%
7,Subtotal - Interim Power Supply Contracts,83.48%
8,,
9,Wholesale Electricity Spot Market (WESM),16.12%


In [194]:
# Attempt to find the index of the row containing the word "CONTRACTS" in the first column
contract_indices = sliced_columns[sliced_columns.iloc[:, 0].str.contains('CONTRACTS', case=False, na=False)].index

if not contract_indices.empty:
    # If "CONTRACTS" is found, use it as the start index
    start_index = contract_indices[0] + 1
else:
    # If "CONTRACTS" is not found, find the first non-null value in the first column
    first_non_null_index = sliced_columns[sliced_columns.iloc[:, 0].notnull()].index[0]
    start_index = first_non_null_index  # Include this row as well

# Find the index of the row containing the word "WESM" in the first column
wesm_index = sliced_columns[sliced_columns.iloc[:, 0].str.contains(r'WESM', case=False, na=False)].index[0]

# Slice the DataFrame to include rows between start_index and end_index (inclusive)
sliced_rows = sliced_columns.iloc[start_index:end_index + 1]

# Exclude the row that contains the word "Metering" in the first column
sliced_rows = sliced_rows[~sliced_rows.iloc[:, 0].str.contains('Metering', case=False, na=False)]

# Uncomment and adjust the following line if you need to rename columns
sliced_rows.columns = ["Power Supplier", "%", "kWh", "Average Generation Cost"]

sliced_rows

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost
3,1. Panay Energy Development Corporation,70.58%,39843995.26,6.2003
4,2. Panay Power Corporation,2.18%,1229643.2,26.778
5,3. KEPCO SPC Power Corporation,6.59%,3720000.0,4.6744
6,"4. Aboitiz Power Renewables, Incorporated",13.18%,7440000.0,4.1899
7,Subtotal - Emergency Power Supply Contracts,92.52%,52233638.46,6.2897
8,Wholesale Electricity Spot Market (WESM),7.48%,4221024.01,3.3897


In [195]:
import re

# Function to clean the "Power Supplier" column
def clean_power_supplier(value):
    # Remove leading numbers and periods
    value = re.sub(r'^\d+\.\s*', '', value)
    return value

# Apply the cleaning function to the first column of sliced_df
sliced_rows.iloc[:, 0] = sliced_rows.iloc[:, 0].apply(clean_power_supplier)

sliced_rows_df = sliced_rows.reset_index(drop=True)
sliced_rows_df

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost
0,Panay Energy Development Corporation,70.58%,39843995.26,6.2003
1,Panay Power Corporation,2.18%,1229643.2,26.778
2,KEPCO SPC Power Corporation,6.59%,3720000.0,4.6744
3,"Aboitiz Power Renewables, Incorporated",13.18%,7440000.0,4.1899
4,Subtotal - Emergency Power Supply Contracts,92.52%,52233638.46,6.2897
5,Wholesale Electricity Spot Market (WESM),7.48%,4221024.01,3.3897


In [196]:
# Find the column that contains the word "Average"
average_column = sliced_columns.columns[sliced_columns.apply(lambda col: col.str.contains('Average', case=False, na=False)).any()][0]

# Find rows where the first column contains the word "TOTAL"
total_row = sliced_columns.loc[sliced_columns.iloc[:, 0].str.contains("TOTAL", case=False, na=False), average_column]
reversed_row = total_row[::-1]

for value in reversed_row:
    if pd.notnull(value):
        total_row_value = value
        break  # Exit loop once a non-null value is found

In [197]:
sliced_rows_df["Generation Charge"] = total_row_value
sliced_rows_df

Unnamed: 0,Power Supplier,%,kWh,Average Generation Cost,Generation Charge
0,Panay Energy Development Corporation,70.58%,39843995.26,6.2003,6.0729
1,Panay Power Corporation,2.18%,1229643.2,26.778,6.0729
2,KEPCO SPC Power Corporation,6.59%,3720000.0,4.6744,6.0729
3,"Aboitiz Power Renewables, Incorporated",13.18%,7440000.0,4.1899,6.0729
4,Subtotal - Emergency Power Supply Contracts,92.52%,52233638.46,6.2897,6.0729
5,Wholesale Electricity Spot Market (WESM),7.48%,4221024.01,3.3897,6.0729
