# Get PDF Text
Save only the relevant PDF text and information into a JSON file. Save the PDF text in a format that preserves headers and body text.

The previous iteration of this file is saved in the GitHub repo as proposal_differentiation_tool.ipynb

In [4]:
'''
USER INPUTS:
    directory - file location directory of the .pdf proposal files (to be converted to .json files of cleaned text) 
Note: The output directory is ./jsons/ so you must create a folder named jsons in the same location as this code file.
'''
# Input folder; this value is handed to main() by the call at the bottom of the script.
directory = './downloaded/'


# Imports
import fitz
import json
import os
from re import findall
from re import sub


def differentiate_pdf_text(doc):
    ''' Find the text properties of body text for a pdf document, return those properties for later use.
        The (font, font size) combination that carries the most words is identified as body text.
        Inputs:
            doc - The PDF document, as read by the fitz module.
        Outputs:
            max_ct - Integer word count of the winning (font, font size) combination (0 if no words found)
            body_text_props - Tuple of properties of the body text, saved as (font, font size),
                              or None when the document contains no countable words
    '''
    ct_by_props = {}
    # range(len(doc)) rather than len(doc) - 1, so the final page is counted too
    for page_num in range(len(doc)):
        # Get page's text (json format) and convert it to dictionaries/lists
        json_text = json.loads(doc[page_num].getText("json"))

        for entry in json_text["blocks"]:
            # Blocks carrying an "image" entry are pictures, not text
            if entry.get("image") is not None:
                continue
            # Blocks without "lines" hold no text to count
            lines = entry.get("lines")
            if lines is None:
                continue

            for line in lines:
                for span in line["spans"]:
                    # Properties tuple - (font, size); size is rounded so that
                    # fractional differences do not split one style into many
                    props = (span["font"], round(span["size"]))

                    # Credit the words of THIS span to THIS span's properties
                    ct = count_words(span["text"])
                    ct_by_props[props] = ct_by_props.get(props, 0) + ct

    max_ct, body_text_props = find_body_text(ct_by_props)

    return max_ct, body_text_props


def count_words(s):
    '''Count the words in a string, where a word is a run of letters, digits or underscores'''
    words = findall(r'\w+', s)
    return len(words)


def find_body_text(ct_by_props):
    ''' Pick the properties with the largest word count out of the dictionary.
        Returns (count, props); returns (0, None) when the dictionary is empty
        or no entry has a count above zero. Ties go to the entry seen first.
    '''
    if not ct_by_props:
        return 0, None

    # max() keeps the first of several equal maxima, matching a strict ">" scan
    top_props = max(ct_by_props, key=ct_by_props.get)
    top_ct = ct_by_props[top_props]

    if top_ct <= 0:
        return 0, None
    return top_ct, top_props


def print_body_text_props(ct, body_text_props):
    ''' Write a short report of the body text properties to stdout.
        Inputs:
            ct - word count found for the body text
            body_text_props - sequence whose first two items are (font, font size)
    '''
    font, font_size = body_text_props[0], body_text_props[1]
    report_rows = (
        ("Font:", font),
        ("Font Size:", font_size),
        ("Number of words:", ct),
    )

    print("Body text properties:")
    for label, value in report_rows:
        print(label, value)
    # Trailing blank line separates this report from whatever is printed next
    print()
    return


def _is_letter_text(text):
    '''Return True when the text carries a letter marker: a standalone "re:" or the word "sincerely".'''
    lowered = text.lower()
    # \b keeps words that merely END in "re:" (e.g. "Infrastructure:") from matching
    return bool(findall(r'\bre:', lowered)) or ("sincerely" in lowered)


def find_extraneous_pages(doc):
    ''' Finds page numbers in the document that are resumes OR introduction letters (without using ML classifier).
        A page counts as a letter when it is one of the first four pages and contains a letter marker.
        A page counts as a resume when any run of same-styled text on it is recognised by is_resume_footer.
        Inputs:
            doc - The PDF document, as read by the fitz module
        Outputs:
            pages_to_be_deleted - List of page numbers to be deleted (ascending, no duplicates)
    '''
    pages_to_be_deleted = []
    # range(len(doc)) rather than len(doc) - 1, so the final page is inspected too
    for page_num in range(len(doc)):
        # Get page's text (json format) and convert it to dictionaries/lists
        json_text = json.loads(doc[page_num].getText("json"))

        is_extraneous = False
        # A "run" is consecutive spans sharing (font, size, color); reset for every page
        prev_props = (None, None, None)
        text_holder = ""

        for entry in json_text["blocks"]:
            # Skip image blocks and blocks that carry no text lines
            if entry.get("image") is not None:
                continue
            lines = entry.get("lines")
            if lines is None:
                continue

            for line in lines:
                for span in line["spans"]:
                    # Identify current span's properties and text - (font, size, color)
                    props = (span["font"], round(span["size"]), span["color"])
                    text = span["text"]

                    # Letters are only looked for on the first four pages
                    if page_num <= 3 and _is_letter_text(text):
                        is_extraneous = True

                    if prev_props == props:
                        # Same style: keep extending the current run
                        text_holder += text
                    else:
                        # Style changed: the finished run can now be judged
                        if is_resume_footer(text_holder, prev_props):
                            is_extraneous = True
                        text_holder = text
                        prev_props = props

        # The last run of the page never sees a style change, so judge it here;
        # otherwise a footer that ends the page would go unnoticed.
        if is_resume_footer(text_holder, prev_props):
            is_extraneous = True

        if is_extraneous:
            pages_to_be_deleted.append(page_num)

    return pages_to_be_deleted


def remove_extra_pages(doc, pages_to_remove=None):
    ''' Removes extraneous pages from the PDF, modifying doc in place:
        the listed pages, then the appendix (if one is found), then the cover page.
        Input:
            doc - The PDF document, as read by the fitz module.
            pages_to_remove - list of page numbers to remove (any order, duplicates tolerated).
                              Defaults to no pages.
    '''
    # Remove resumes/letters by page number. Highest number first, so that a
    # deletion never shifts the index of a page that is still to be deleted.
    for page_num in sorted(set(pages_to_remove or ()), reverse=True):
        doc.deletePage(page_num)

    # Find & remove appendix (the last page is checked as well)
    for page_num in range(len(doc)):
        # Check the page word count (Low word count = title page)
        plain_text = doc[page_num].getText("text")
        if count_words(plain_text) >= 10:
            continue

        no_newlines_text = plain_text.replace("\n", " ").lower()
        # A short table-of-contents page may mention the appendix; it is not its start
        if no_newlines_text.startswith("table of contents"):
            continue
        if "appendix" in no_newlines_text:
            # Beginning of the appendix. Remove it and all subsequent pgs
            doc.deletePageRange(page_num, len(doc) - 1)
            break

    # Remove the cover page (nothing to do if the document is already empty)
    if len(doc) > 0:
        doc.deletePage(0)

    return


def is_resume_footer(text, props):
    ''' Checks if a run of text is a resume footer, judged by its text properties only.
        Inputs:
            text - the text of the run. Currently not consulted: matching on employee
                   names was disabled, the parameter is kept so callers stay unchanged.
            props - tuple of (font, font size, color) of the run
        Output:
            Boolean (True if props equal one of the known resume footer styles)
    '''
    # Known footer styles, as (font, rounded font size, sRGB color integer)
    resume_footer_props = (
        ('ArialNarrow-Bold', 9, 2576241),
        ('CIDFont+F3', 9, 2576241),
    )
    return props in resume_footer_props


def remove_table_text(text_dict):
    ''' Drops entries whose text is short (length <= 40), which are taken to come from tables.
        The dictionary is changed in place and the same object is returned.
    '''
    # Collect first, delete afterwards: a dict must not change size while being iterated
    short_keys = [key for key, body in text_dict.items() if len(body) <= 40]

    for key in short_keys:
        text_dict.pop(key)

    return text_dict


def _strip_bold_suffix(font_name):
    '''Return the font name without a trailing "-Bold" (unchanged if it does not end that way).'''
    suffix = "-Bold"
    # A real suffix test: str.rstrip("-Bold") would strip any trailing run of the
    # characters - B o l d, turning "Arial-Bold" into "Aria".
    if font_name.endswith(suffix):
        return font_name[:-len(suffix)]
    return font_name


def save_to_dict(doc, body_text_props):
    ''' Using the previously-found body text properties, save the document into a dictionary.
        A span is body text when it has the body font (or its "-Bold" variant) at the body size,
        its color is approximately gray, and a header has already been seen on the same page.
        Every other span is header text; consecutive header spans with equal properties are joined.
        Spans of two characters or fewer (after stripping whitespace) are ignored.
        Input:
            doc - The PDF document, as read by the fitz module
            body_text_props - Tuple of properties of the body text, saved as (font, font size).
                              May be None (no body text identified), which yields an empty dictionary.
        Output:
            text_dict - Dictionary of PDF text, formatted as headers:body text
    '''
    text_dict = {}
    # Without body text properties nothing can be classified as body text
    if body_text_props is None:
        return text_dict
    body_font, body_size = body_text_props[0], body_text_props[1]

    # range(len(doc)) rather than len(doc) - 1, so the final page is saved too
    for page_num in range(len(doc)):
        # Get page's text (json format) and convert it to dictionaries/lists
        json_text = json.loads(doc[page_num].getText("json"))

        # Header tracking starts afresh on every page
        prev_props = (None, None)
        prev_type = None
        header = None

        for entry in json_text["blocks"]:
            # Skip image blocks and blocks that carry no text lines
            if entry.get("image") is not None:
                continue
            lines = entry.get("lines")
            if lines is None:
                continue

            for line in lines:
                for span in line["spans"]:
                    # Identify current span's properties and text
                    props = (span["font"], round(span["size"]))
                    text = span["text"].strip()

                    if len(text) <= 2:
                        continue

                    has_body_style = (props == body_text_props) or (
                        _strip_bold_suffix(props[0]) == body_font and props[1] == body_size
                    )
                    is_body = (
                        header is not None
                        and has_body_style
                        and is_color_grayscale(span["color"])
                    )

                    if is_body:
                        prev_type = "body"
                        # Append to the text already stored under this header, if any
                        if header in text_dict:
                            text_dict[header] += " " + text
                        else:
                            text_dict[header] = text
                    else:
                        # Not body text.
                        if prev_type == "header" and prev_props == props:
                            # Header continues across spans
                            header += " " + text
                        else:
                            header = text
                        prev_type = "header"
                    prev_props = props
    return text_dict


def is_color_grayscale(srgb_color):
    ''' Decide whether a color is APPROXIMATELY gray.
        A color counts as gray when its largest and smallest RGB
        components are no more than 40 apart.
        Input:
            srgb_color - standard RGB color code
        Output:
            Boolean (True if approx. grayscale, False otherwise)
    '''
    channels = get_rgb_color(srgb_color)
    spread = max(channels) - min(channels)
    return spread <= 40


def get_rgb_color(srgb_color_code):
    ''' Convert a standard RGB color code into its components.
        Input:
            srgb_color_code - standard RGB color code
        Output:
            (R, G, B) tuple color code, as produced by fitz.sRGB_to_rgb
    '''
    return fitz.sRGB_to_rgb(srgb_color_code)


def get_text(doc):
    ''' Turn a PDF into a dictionary of its text.
        Steps, in order: drop the pages flagged as extraneous, work out which
        text properties mark body text, collect headers with their body text,
        and finally discard the short entries.
        Input:
            doc - The PDF document, as read by the fitz module (pages are removed from it)
        Output:
            text_dict - dictionary containing the PDF text (in the format header:body text)
    '''
    # Pages are located first and removed afterwards, so the page numbers stay valid
    remove_extra_pages(doc, find_extraneous_pages(doc))

    # Only the properties are needed here; the word count is ignored
    _, body_text_props = differentiate_pdf_text(doc)

    headers_to_body = save_to_dict(doc, body_text_props)

    # Short entries are treated as table residue and dropped
    return remove_table_text(headers_to_body)


def main(directory):
    ''' Main driver function: Convert all pdfs in the directory to text files (json).
        For every "<name>.pdf" found in directory, "./jsons/<name>.json" is written.
        Entries without a .pdf extension (any letter case) are skipped.
        Input:
            directory - folder holding the .pdf files
        Output:
            None ("Done" is printed once all files are converted)
    '''
    output_dir = "./jsons/"
    # Create the output folder on demand instead of failing on the first write
    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(directory):
        # splitext removes exactly the extension; str.rstrip(".pdf") would strip any
        # trailing run of the characters . p d f (e.g. "app.pdf" -> "a")
        stem, extension = os.path.splitext(filename)
        if extension.lower() != ".pdf":
            continue

        print(filename)
        f = os.path.join(directory, filename)
        doc = fitz.open(f)
        try:
            text_dict = get_text(doc)
        finally:
            # Release the file handle even if the conversion fails
            doc.close()

        # Dump dictionary text to json file
        with open(os.path.join(output_dir, stem + ".json"), "w") as outfile:
            json.dump(text_dict, outfile)

    print("Done")
    return None


# Run the conversion on the folder configured in `directory`; main() prints each file name it handles.
main(directory)

17.00461 City of Brighton CO_Donelson Park Parking Lot_Apr17_P17.00461_FINAL.pdf
17.00532 Buena Vista CO_Midland Hills Bridge Trail Development_Apr17_P17.00532_FINAL_.pdf
17.00592 City of Fort Morgan_CentrePointe Plaza_May25_P17.00592_FINAL.pdf
17.01422 Adams County CO_ADA Transition Program_July17_P17.01422_FINAL.pdf
17.01998 Greenwood Village_Orchard Road_Dec17_P17.01998_FINAL_with letters.pdf
18.00308 Jefferson County_RoadwaySignage_Mar13_P18.00308_FINAL.pdf
18.00317 Greenwood Village_GravelRehab_March18_18.00317_Final.pdf
18.00333 MesaCounty_ERoad_March18_P18.00333_pagenumbers.pdf
18.00422 City of Brighton_On-Call _Mar18_P18.00422_FINAL.pdf
18.00575 City of Greenwood Village_TurnLaneExtension_P18.00575_FINAL2.pdf
18.00701 Adams County Pecos St 0518 P18.00701 FINAL.pdf
18.00701 Adams County Pecos St 1018 FINAL LR.pdf
18.00979 Mesa County_64.6 Design Build SOQ_July18_P18.00979_FINAL FOR PRINTING.pdf
18.01326 TownofParker_On-Call_Sept18_P18.01326_FINAL_LR.pdf
18.01326_TownofParker_On-