In [75]:
import pdfplumber
import pandas as pd
import matplotlib.pyplot as plt
import re
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

plt.style.use('default')

In [76]:
revenue_path = '../z_Data/ME_Revenue/FY 2025 Revenue ME.pdf'

In [77]:
# Extract table from page 6 as text
with pdfplumber.open(revenue_path) as pdf:
    exhibit_i_text = pdf.pages[5].extract_text()  # Page 6 is index 5

# Find table boundaries
lines = exhibit_i_text.split('\n')
start_idx = None
end_idx = None
for i, line in enumerate(lines):
    if line.startswith('Sales and Use Tax'):
        start_idx = i
    if line.startswith('NOTES:'):
        end_idx = i
        break

# Extract table lines
table_lines = [line.strip() for line in lines[start_idx:end_idx] if line.strip()]

# Parse table data - each row has 10 elements: source + 9 values
data = []
for i in range(0, len(table_lines), 10):
    if i + 9 < len(table_lines):
        row_lines = table_lines[i:i+10]
        source = row_lines[0]
        values = row_lines[1:]
        
        # Clean values: remove $, %, commas, handle negatives
        cleaned_values = []
        for val in values:
            val = val.replace('$', '').replace('%', '').replace(',', '')
            if val.startswith('(') and val.endswith(')'):
                val = '-' + val[1:-1]
            try:
                if '.' in val:
                    cleaned_values.append(float(val))
                else:
                    cleaned_values.append(int(val))
            except ValueError:
                cleaned_values.append(0)  # Default to 0 if not parseable
        
        row = [source] + cleaned_values
        data.append(row)

# Create DataFrame
columns = ['Source', 'Month Actual', 'Month Budget', 'Month Variance', 'Month %', 'FYTD Actual', 'FYTD Budget', 'FYTD Variance', 'FYTD %', 'Total Budgeted FY']
df_text = pd.DataFrame(data, columns=columns)
df_text

Unnamed: 0,Source,Month Actual,Month Budget,Month Variance,Month %,FYTD Actual,FYTD Budget,FYTD Variance,FYTD %,Total Budgeted FY
0,"Sales and Use Tax $ 203,877,545 $ 199,958,406 ...",0,0,0,0,0,0,0,0,0


In [102]:
def find_exhibit_page(pdf, exhibit_name):
    for i, page in enumerate(pdf.pages):
        lines = page.extract_text_lines()
        first_line = lines[0]['text'] if lines else ""
        if first_line.endswith(exhibit_name) or first_line.endswith(exhibit_name.upper()):
            return i
    return -1

In [115]:
# Extract table from page 6 as text
def extract_revenue_source_table(year):
    revenue_path = f'../z_Data/ME_Revenue/FY {year} Revenue ME.pdf'
    with pdfplumber.open(revenue_path) as pdf:
        exhibit_i_page = find_exhibit_page(pdf, 'Exhibit I')
        if( exhibit_i_page == -1):
            raise ValueError("Exhibit I page not found.")
        exhibit_i_text = pdf.pages[exhibit_i_page].extract_text()  # Page 6 is index 5

    # Find table boundaries
    lines = exhibit_i_text.split('\n')
    start_idx = None
    end_idx = None
    for i, line in enumerate(lines):
        if line.startswith('Sales and Use Tax'):
            start_idx = i
        if line.startswith('NOTES:'):
            end_idx = i
            break

    # Extract table lines
    table_lines = [line.strip() for line in lines[start_idx:end_idx] if line.strip()]

    data = []
    for line in table_lines:
        line = line.replace('$ ', '').replace('%', '').replace(',', '').replace('( ', '(').replace(' )', ')')
        line = re.sub(r'(?<![\d.])\b(\d)\s+(\d+)\b', r'\1\2', line) # Fix spaces in numbers that happens pre-2019
        line = line.split()
        line_values = line[len(line) - 9:]
    
        clean_values = ['-' + value[1:-1] if value.startswith('(') and value.endswith(')') else value for value in line_values ]
        clean_values = [np.nan if value == '-' or value == '' else value for value in clean_values]
            
        source = ' '.join(line[0:len(line) - 9])
        clean_values = [source] + clean_values
        data.append(clean_values)

    columns = ['Source', 'Month Actual', 'Month Budget', 'Month Variance', 'Month % Variance', 'FYTD Actual', 'FYTD Budget', 'FYTD Variance', 'FYTD % Variance', 'Total Budgeted FY']
    return pd.DataFrame(data, columns=columns)

FileNotFoundError: [Errno 2] No such file or directory: '../z_Data/ME_Revenue/FY 2015 Revenue ME.pdf'