# City Budget Extractor

In [5]:
import PyPDF2
import pandas as pd
from pprint import pprint

After struggling for 20 minutes or so, I realized that the page numbering in the PDF document doesn't match what is pulled by PyPDF2, because the budget PDF includes some "intro" pages that aren't counted by the GUI PDF reader but are counted by PyPDF2.

In [4]:
# This PDF has some special formatting that offsets the page numbers: PyPDF2
# counts the "intro" pages that the GUI PDF reader's numbering skips.
page_offset = 11
filename = "FY-19-20-Adopted-Budget.pdf"

reader = PyPDF2.PdfFileReader(filename)
# Page 279 as numbered in the document, shifted by the offset defined above
# (previously the offset was repeated here as a literal 11).
page = reader.getPage(279 + page_offset)

Let's take a look at what the extracted text looks like. Because it's a PDF I'm already expecting something terrible, and that's what it looks like we got. We get back one large string that sort of goes across the page row by row. We'll need to split this apart using some custom logic.

For the headers I'm just going to write them down manually. While I could write some clever python it's really not worth it because there's only 6 headers, they're the same page to page, and I don't want the actual string from the text anyway.

In [6]:
# Column labels for the six budget figures of each row, written out by hand
# in the order the figures appear on the page.
HEADERS = [
    "FY2015/16_Actual",
    "FY2016/17_Actual",
    "FY2017/18_Actual",
    "FY2018/19_Actual",
    "FY2018/19_Revised",
    "FY2019/20_Adopted",
]

With the headers done, let's extract each row. The pattern I see here is the following sequence, starting with a string that tells us the fund type:

1. A string such as "All Funds" or "General Fund"
2. An all-caps line that signifies the start of the block of expenses
3. The line item of expenses
4. 6 rows that are the actual budget expenses for my city



In [16]:
class Parser:
    """Parse one page of the city budget PDF into a table of expenditures.

    The work is organised in passes over the page text:

    1. ``parse_block_headers`` finds the all-caps block headers and the
       repeated header line that closes each block.
    2. ``parse_row_labels`` walks every block and pairs each line-item label
       with the numbers that follow it.
    3. ``parse_numbers`` is a placeholder and does nothing yet.
    """

    # Column labels for the six budget figures of each row, in page order.
    HEADERS = ["FY2015/16_Actual", "FY2016/17_Actual",
               "FY2017/18_Actual", "FY2018/19_Actual",
               "FY2018/19_Revised", "FY2019/20_Adopted"]

    def __init__(self, page=279, page_offset=11, filename="FY-19-20-Adopted-Budget.pdf"):
        """Load one page of the PDF and split its text into cleaned lines.

        Parameters
        ----------
        page : int
            Page number as shown in the document.
        page_offset : int
            Number added to ``page`` to get the index PyPDF2 expects; this
            PDF has some special formatting that offsets the page numbers.
        filename : str
            Path of the budget PDF to read.
        """
        # Open the requested file instead of relying on a notebook-global
        # reader, so the three parameters actually take effect.
        pdf = PyPDF2.PdfFileReader(filename)
        self.page = pdf.getPage(page + page_offset)

        # Split the text into discrete line units and clean up spacing.
        lines = [line.strip() for line in self.page.extractText().split("\n")]

        # Parse the budget numbers into ints; everything else stays a string.
        self.text = pd.Series(lines, dtype=object).apply(self.coerce_numbers)

    def parse(self):
        """Not implemented yet; always raises ``NotImplementedError``."""
        raise NotImplementedError

    @staticmethod
    def is_header(line):
        """Return True when ``line`` is an all-caps, letters-and-spaces string.

        Values that are not strings (for example already-parsed numbers)
        are never headers.
        """
        if not isinstance(line, str):
            return False
        return line.isupper() and line.replace(" ", "").isalpha()

    @staticmethod
    def coerce_numbers(line):
        """Try parsing a number string into an int, else return it unchanged.

        Thousands separators (``,``) are dropped. A number wrapped in
        parentheses, e.g. ``"(1,234)"``, is returned as a negative value.
        Text that is not a whole number -- including parenthesised text such
        as ``"(abc)"`` -- is returned as the original string.
        """
        digits = line.replace(",", "")

        # Parentheses around the whole value mark it as negative.
        negative = len(digits) >= 2 and digits[0] == "(" and digits[-1] == ")"
        if negative:
            digits = digits[1:-1]

        try:
            value = int(digits)
        except ValueError:
            return line
        return -value if negative else value

    def parse_block_headers(self):
        """Identify the headers from the page as well as their ending lines.

        A block opens at a header line and closes at the next header line
        with the same text. Only closed blocks are recorded.

        Returns
        -------
        pandas.DataFrame
            One row per block with columns ``header``, ``start`` and ``end``
            (positions within ``self.text``). Also stored as ``self.headers``.
        """
        open_block = None
        blocks = []

        # Positions (not index labels) are recorded so they can be used for
        # positional slicing later on.
        for position, line in enumerate(self.text):
            if not self.is_header(line):
                continue
            if open_block is not None and open_block["header"] == line:
                # Same header seen again: this closes the block.
                open_block["end"] = position
                blocks.append(open_block)
                open_block = None
            else:
                # A different header (re)starts the block being tracked.
                open_block = {"header": line, "start": position}

        # Explicit columns keep the frame well-formed when no block is found.
        self.headers = pd.DataFrame(blocks, columns=["header", "start", "end"])
        return self.headers

    def parse_row_labels(self):
        """Build the budget table from the lines inside each header block.

        Notes
        -----
        Row labels must appear after the first header and before the last
        header of a block. Uses ``self.headers``; if ``parse_block_headers``
        has not been run yet it is run first.

        Returns
        -------
        pandas.DataFrame
            One row per line item with ``header``, ``line_item``,
            ``complete`` and one column per entry of ``HEADERS``. Also
            stored as ``self.budget``.

        Raises
        ------
        TypeError
            If a line inside a block is neither a string nor an int.
        """
        headers = getattr(self, "headers", None)
        if headers is None:
            headers = self.parse_block_headers()

        rows = []
        blocks = zip(headers["header"], headers["start"], headers["end"])
        for header, start, end in blocks:
            row = {"header": header, "values": []}

            # Text strictly between the opening and closing header lines.
            for line in self.text.iloc[start + 1:end]:

                # All rows should end with a % sign
                # TODO: There's still an issue where the line is not bookended always
                if "%" in str(line):
                    # A row is complete when it has one value per column.
                    row["complete"] = len(row["values"]) == len(self.HEADERS)

                    # Explode numerical values and pair them with headers.
                    numbers = row.pop("values")
                    for key, val in zip(self.HEADERS, numbers):
                        row[key] = val

                    rows.append(row)
                    row = {"header": header, "values": []}

                # A string inside the block is the row label.
                elif isinstance(line, str):
                    row["line_item"] = line

                # Otherwise it must be one of the parsed budget figures.
                elif isinstance(line, int):
                    row["values"].append(line)

                else:
                    raise TypeError(
                        "unexpected value in page text: {!r}".format(line)
                    )

        self.budget = pd.DataFrame(rows)
        return self.budget

    def parse_numbers(self):
        """Identify indices of valid numbers from the page (not implemented)."""

In [17]:
# Build a parser with its default arguments.
p = Parser()
# Pass 1: locate the header blocks on the page.
df = p.parse_block_headers()
# Pass 2: build the budget table; as the last expression in the cell, its
# value is what the notebook displays below.
p.parse_row_labels()

Unnamed: 0,header,line_item,complete,FY2015/16_Actual,FY2016/17_Actual,FY2017/18_Actual,FY2018/19_Actual,FY2018/19_Revised,FY2019/20_Adopted
0,PERSONNEL SERVICES,"Salaries, Permanent",True,34323749,34654406,25765375,37010295,37033765,36135762
1,PERSONNEL SERVICES,"Salaries, Temporary",True,499772,420908,348015,367098,538702,367948
2,PERSONNEL SERVICES,"Salaries, Overtime",True,5007346,5043233,4093771,3953950,4372335,4049950
3,PERSONNEL SERVICES,Benefits,False,1466088,1550479,1079615,25343062,26926178,21666643
4,OPERATING EXPENSES,Utilities,True,17654,31687,30413,19500,19500,19500
5,OPERATING EXPENSES,Equipment and Supplies,True,1105697,1575090,935471,985254,1489857,1328684
6,OPERATING EXPENSES,Repairs and Maintenance,True,1106671,939054,752048,964510,986248,964510
7,OPERATING EXPENSES,Conferences and Training,True,344329,337535,308983,334105,335654,225767
8,OPERATING EXPENSES,Professional Services,True,503872,458393,391996,335825,735552,335825
9,OPERATING EXPENSES,Other Contract Services,True,1727604,1790163,1569292,2279087,2355534,2189087
