## Proposal_Differentiation_Tool

Differentiate between Titles, Headers, Body paragraphs, etc.

# Save document (headers & body text) into a dictionary

### Initialize

In [1]:
import fitz
import json
from re import findall
from re import sub

### Main

In [140]:
%%time
doc = fitz.open("./downloaded/PDF.pdf")

def main(doc):
    # Run function to identify what properties are plain text
    ct, body_text_props = differentiate_pdf_text(doc)
    print()
    print_body_text_props(ct, body_text_props)

    # Remove appendix/CVs
    print("Before appendix/CV removal:", len(doc), "pages")
    resume_pages = find_resume_pages(doc)
    print(resume_pages)
    remove_appendix(doc, resume_pages)
    print("After removal:", len(doc), "pages")
    print()
    
    # Go back through and save headers & body text to a dict
    text_dict = save_to_dict(doc, body_text_props)
    '''for k in text_dict:
        print(k, "\n", text_dict[k], "\n")'''

    return

main(doc)


Body text properties:
Font: ArialNarrow
Font Size: 11
Color: rgb(34, 31, 31)
Number of words: 6423

Before appendix/CV removal: 41 pages
[28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
After removal: 29 pages

Wall time: 5.38 s


### Find body text properties

In [82]:
'''
Functions for finding the text properties of body text for a pdf document.
Save those properties for later use.
'''
def differentiate_pdf_text(doc):
    pages_to_be_deleted = []
    ct_by_props = {}
    for page_num in range(len(doc) - 1):
        # Get page's text (json format)
        json_text = doc[page_num].getText("json")

        # Convert string/json to usable format (dictionary/lists)
        json_text = json.loads(json_text)  
        text_blocks = json_text["blocks"]

        # Set previous properties to None to prepare for comparisons
        prev_props = (None, None, None)
        text_holder = ""

        # Iterate through all the text entries to count words for each set of properties
        for entry in text_blocks: 
            # Check if this block is an image
            try:
                img = entry["image"]
            except:
                img = None
                pass

            # Check if this block exists, or if the end of the page has been reached
            try: 
                lines = entry["lines"]
            except:
                lines = None
                pass

            # Line is not an image and contains text so we continue
            if lines != None and img == None:             
                for spans in lines:
                    # Get the line data
                    spans = spans["spans"]                

                    for i in range(len(spans)):
                        span = spans[i]

                        # Identify current line's properties and text
                        # put into tuple - (font, size, color)
                        props = (span["font"], round(span["size"]), span["color"])
                        text = span["text"]

                        ct = count_words(text_holder)

                        # Update dictionary
                        if props not in ct_by_props:
                            ct_by_props[props] = ct
                        else:
                            ct_by_props[props] = ct_by_props[props] + ct

                        # Update text holder & prev properties to new
                        text_holder = text
                        prev_props = props
                        
    max_ct, body_text_props = find_body_text(ct_by_props)                    
                        
    return max_ct, body_text_props

In [75]:
def count_words(s):
    return len(findall(r'\w+', s))


def find_body_text(ct_by_props):
    max_ct = 0
    max_ct_props = None
    for props in ct_by_props:
        props_ct = ct_by_props[props]
        if props_ct > max_ct:
            max_ct = props_ct
            max_ct_props = props
    return max_ct, max_ct_props


def print_body_text_props(ct, body_text_props):
    font = body_text_props[0]
    font_size = body_text_props[1]
    font_color_code = body_text_props[2]

    print("Body text properties:")
    print("Font:", font)
    print("Font Size:", font_size)
    print("Color:", get_rgb_color(font_color_code))
    print("Number of words:", ct)
    print()
    return

### Remove extraneous pages (appendix/CVs)

In [136]:
'''Finds page numbers that are resumes'''
def find_resume_pages(doc):
    pages_to_be_deleted = []
    for page_num in range(len(doc) - 1):        
        # Get page's text (json format)
        json_text = doc[page_num].getText("json")

        # Convert string/json to usable format (dictionary/lists)
        json_text = json.loads(json_text)  
        text_blocks = json_text["blocks"]

        # Set previous properties to None to prepare for comparisons
        prev_props = (None, None, None)
        text_holder = ""

        # Iterate through all the text entries to count words for each set of properties
        for entry in text_blocks: 
            # Check if this block is an image
            try:
                img = entry["image"]
            except:
                img = None
                pass

            # Check if this block exists, or if the end of the page has been reached
            try: 
                lines = entry["lines"]
            except:
                lines = None
                pass

            # Line is not an image and contains text so we continue
            if lines != None and img == None:             
                for spans in lines:
                    # Get the line data
                    spans = spans["spans"]                

                    for i in range(len(spans)):
                        span = spans[i]

                        # Identify current line's properties and text - (font, size, color)
                        props = (span["font"], round(span["size"]), span["color"])
                        text = span["text"]

                        ct = count_words(text_holder)

                        # Check
                        if prev_props == props:
                            text_holder += text
                        else:
                            if is_resume_footer(text_holder, prev_props):
                                pages_to_be_deleted.append(page_num) 
                            text_holder = text
                            prev_props = props                   
                        
    return pages_to_be_deleted

In [135]:
def is_resume_footer(text, props):
    ''' 
    If we can only remove resumes:
    1. Check whether a block of text is a resume footer (using text properties & name checking)
    2. Probably need to manually remove the resumes from old (pre-2020) pdfs, as they don't match format of new pdfs
    '''
    resume_footer_props = ('ArialNarrow-Bold', 9, 2576241)
    resume_names = ["LIZ MANASSEE", 
                    "TOM CONLIN, PE", "TOM CONLIN",
                    'JAMES "KEN" MCCARRON, PH. D.', 'JAMES "KEN" MCCARRON',
                    "ANGY CASAMENTO, PE", "ANGY CASAMENTO", 
                    "LUKE ARNOLD", "LUKE ARNOLD, PE", 
                    "RICHARD ARCHER, PE", "RICHARD ARCHER", 
                    "ROBERT SMITH, PE", "ROBERT SMITH", 
                    "CLARK ROBERTS, PE", "CLARK ROBERTS",
                    "ANGY CASAMENTO, PE", "ANGY CASAMENTO", 
                    "PAUL MOREAU, PE", "PAUL MOREAU", 
                    "ANDI SCHMID, PE", "ANDI SCHMID", 
                    "AARON LAUINGER",]
    if (props == resume_footer_props):
        text = text.strip("|, ")
        if (text in resume_names):
            return True
    return False

In [138]:
def remove_appendix(doc, resume_pages):
    for page_num in reversed(resume_pages):
        doc.deletePage(page_num)

    for page_num in range(len(doc) - 1):
        # Check the page word count (Low word count = title page)
        plain_text = doc[page_num].getText("text")
        page_wordcount = count_words(plain_text)
        if page_wordcount < 10:
            no_newlines_text = plain_text.replace("\n", " ").lower()
            # Check if appendix. 
            if no_newlines_text.startswith("table of contents"):
                pass
            elif "appendix" in no_newlines_text:
                # Beginning of the appendix. Remove all subsequent pgs
                doc.deletePageRange(page_num, len(doc) - 1)
                return
    return

### Save document as a dictionary to associate headers w/ body text

In [76]:
'''
Using the previously-found body text properties, save the document into a dictionary.
'''
def save_to_dict(doc, body_text_props):    
    text_dict = {}
    for page_num in range(len(doc) - 1):
        # Get page's text (json format)
        json_text = doc[page_num].getText("json")

        # Group json entries by those that share the same color&font&size

        # Convert string/json to usable format (dictionary/lists)
        json_text = json.loads(json_text)  
        text_blocks = json_text["blocks"]

        # Set previous properties to None to prepare for comparisons
        prev_props = (None, None, None)
        prev_text_type = None
        text_holder = ""

        # Iterate through all the text entries
        for entry in text_blocks:
            # Check if this block is an image
            try:
                img = entry["image"]
            except:
                img = None
                pass

            # Check if this block exists, or if the end of the page has been reached
            try: 
                lines = entry["lines"]
            except:
                lines = None
                pass

            # Line is not an image and contains text so we continue
            if lines != None and img == None:             
                for spans in lines:
                    # Get the line data
                    spans = spans["spans"]                

                    for i in range(len(spans)):
                        span = spans[i]

                        # Identify current line's properties and text
                        props = (span["font"], round(span["size"]), span["color"])
                        text = span["text"].strip()

                        # Check if current line is body text                        
                        if props == body_text_props:                           
                            try:
                                text_dict[header] += text
                            except KeyError:
                                text_dict[header] = text
                        else: 
                            # Not body text.
                            header = text
    return text_dict

#### misc. helper functions

In [39]:
def get_rgb_color(color_code):
    cc = fitz.sRGB_to_rgb(color_code)    
    cc = "".join(str(cc))
    cc = "rgb" + cc
    return cc

## Get All Text tool

In [35]:
# Select document & page number (0-indexed) to read
doc = fitz.open("./downloaded/.pdf")
page_num = 4

# Run function to get the plain text
get_pdf_text(doc, page_num)

 

Table of 

CONTENTS 

Section 1: Project TeamSection 2: Past Performance on Similar Projects/Similar TeamsSection 3: Project Controls & QualitySection 4: Project Concept & Critical IssuesSection 5: FeeAppendix A: Resumes


In [4]:
def get_pdf_text(doc, page_num):
    # Get page's text (json format)
    json_text = doc[page_num].getText("json")

    # Group json entries by those that share the same color&font&size

    # Convert string/json to usable format (dictionary/lists)
    json_text = json.loads(json_text)  
    text_blocks = json_text["blocks"]
    
    # Set previous properties to None to prepare for comparisons
    prev_color = None
    prev_font = None
    prev_size = None
    text_holder = ""

    # Iterate through all the text entries
    for entry in text_blocks:
        # Check if this block is an image
        try:
            img = entry["image"]
        except:
            img = None
            pass

        # Check if this block exists, or if the end of the page has been reached
        try: 
            lines = entry["lines"]
        except:
            lines = None
            pass

        # Line is not an image and contains text so we continue
        if lines != None and img == None:             
            for spans in lines:
                # Get the line data
                spans = spans["spans"]                

                for i in range(len(spans)):
                    span = spans[i]
                    
                    # Identify current line's properties and text
                    #font = span["font"]
                    color = span["color"]
                    size = span["size"]
                    text = span["text"]

                    # Check if current line has different properties from previous
                    if (color != prev_color) or (size != prev_size):
                        # Current line has different properties.

                        # For now, just print whatever text had the previous properties
                        last_printed_text = text_holder
                        print(text_holder, "\n")

                        # Reset text holder & prev_ properties
                        text_holder = text
                        #prev_font = font
                        prev_size = size
                        prev_color = color

                    else:
                        # Current line has the same properties. Add it to the text block.
                        text_holder += text
                        
    if last_printed_text != text_holder:
        print(text_holder)
  
    return

## Differentiation between Text and Headers

In [18]:
%%time
# Input document
doc = fitz.open("./downloaded/.pdf")

# Check if Table of Contents (bookmarks) exists for the PDF
toc = doc.getToC()
if len(toc) != 0:
    # Use ToC to differentiate
    pass

# Run function to identify what properties are plain text
ct_by_props = differentiate_pdf_text(doc)
#for key in ct_by_props:
#    print(key, "==", ct_by_props[key])

# Select body text by whatever set of text props has the largest wordcount
ct, body_text_props = find_body_text(ct_by_props)
font = body_text_props[0]
font_size = body_text_props[1]
font_color_code = body_text_props[2]

print("Body text properties:")
print("Font:", font)
print("Font Size:", font_size)
print("Color:", get_rgb_color(font_color_code))
print("Number of words:", ct)
print()

# Go back through and "tag" headers vs. body text


Body text properties:
Font: ArialNarrow
Font Size: 10
Color: rgb(34, 31, 31)
Number of words: 5017

Wall time: 773 ms


In [11]:
def count_words(s):
    return len(findall(r'\w+', s))

# Find properties of body text for a pdf document
def differentiate_pdf_text(doc):
    num_pages = len(doc)
    # Create a dictionary to hold word counts for each set of properties - props:ct
    ct_by_props = {}
        
    for page_num in range(num_pages):          
        # Get page's text (json format)
        json_text = doc[page_num].getText("json")

        # Convert string/json to usable format (dictionary/lists)
        json_text = json.loads(json_text)  
        text_blocks = json_text["blocks"]

        # Set previous properties to None to prepare for comparisons
        prev_props = (None, None, None)
        text_holder = ""

        # Iterate through all the text entries to count words for each set of properties
        for entry in text_blocks: 
            # Check if this block is an image
            try:
                img = entry["image"]
            except:
                img = None
                pass

            # Check if this block exists, or if the end of the page has been reached
            try: 
                lines = entry["lines"]
            except:
                lines = None
                pass

            # Line is not an image and contains text so we continue
            if lines != None and img == None:             
                for spans in lines:
                    # Get the line data
                    spans = spans["spans"]                

                    for i in range(len(spans)):
                        span = spans[i]

                        # Identify current line's properties and text
                        # put into tuple - (font, size, color)
                        props = (span["font"], round(span["size"]), span["color"])
                        text = span["text"]                      

                        ct = count_words(text_holder)

                        # Update dictionary
                        if props not in ct_by_props:
                            ct_by_props[props] = ct
                        else:
                            ct_by_props[props] = ct_by_props[props] + ct

                        # Update text holder & prev properties to new
                        text_holder = text
                        prev_props = props
    return ct_by_props

In [13]:
def get_rgb_color(color_code):
    cc = fitz.sRGB_to_rgb(color_code)    
    cc = "".join(str(cc))
    cc = "rgb" + cc
    return cc

cc = get_rgb_color(13027530)
print(cc)

rgb(198, 200, 202)


## Notes section (Misc. code used for debugging)

In [141]:
doc = fitz.open("./downloaded/PDF.pdf")
page_num = 37

def get_props_text(doc, page_num):
    # Get page's text (json format)
    json_text = doc[page_num].getText("json")

    # Convert string/json to usable format (dictionary/lists)
    json_text = json.loads(json_text)  
    text_blocks = json_text["blocks"]

    # Iterate through all the text entries to count words for each set of properties
    for entry in text_blocks: 
        # Check if this block is an image
        try:
            img = entry["image"]
        except:
            img = None
            pass

        # Check if this block exists, or if the end of the page has been reached
        try: 
            lines = entry["lines"]
        except:
            lines = None
            pass

        # Line is not an image and contains text so we continue
        if lines != None and img == None:             
            for spans in lines:
                # Get the line data
                spans = spans["spans"]                

                for i in range(len(spans)):
                    span = spans[i]

                    # Identify current line's properties and text
                    # put into tuple - (font, size, color)
                    props = (span["font"], round(span["size"]), span["color"])
                    text = span["text"]
                    text = text.strip("|")
                    print(props, "|", text)  
    return

get_props_text(doc, page_num)

mupdf: cannot open ./downloaded/PDF.pdf: No such file or directory


RuntimeError: cannot open ./downloaded/PDF.pdf: No such file or directory

In [79]:
# Check what style is like for json entries/spans/span/text
doc = fitz.open("./downloaded/.pdf")
page_num = 13

def print_first_entry(doc, page_num):
    # Get page's text (json format)
    json_text = doc[page_num].getText("json")

    # Convert string/json to usable format (dictionary/lists)
    json_text = json.loads(json_text)  
    text_blocks = json_text["blocks"]
    
    # Iterate through all the text entries
    for entry in text_blocks:
        
        lines = entry["lines"]
        print("LINES:\n", lines, "\n")

        if lines != None:
            for spans in lines:       
                # Get the line data
                spans = spans["spans"]
                
                print("SPANS:\n", spans, "\n")

                for i in range(len(spans)):
                    span = spans[i]
                    
                    print("SPAN:\n", span, "\n")
                    
                    # Identify current line's properties and text
                    text = span["text"]
                    print("TEXT:\n", text, "\n")
        break
    return

print_first_entry(doc,page_num)

mupdf: cannot open ./downloaded/.pdf: No such file or directory


RuntimeError: cannot open ./downloaded/.pdf: No such file or directory

In [72]:
doc = fitz.open("./downloaded/pdf.pdf")
page_num = 30

def get_json_text(doc, page_num):
    # Get page's text (json format)
    json_text = doc[page_num].getText("json")

    # Group json entries by those that share the same color&font&size

    # Convert string/json to usable format (dictionary/lists)
    json_text = json.loads(json_text)  
    text_blocks = json_text["blocks"]
    return print(text_blocks)

get_json_text(doc, page_num)

mupdf: cannot open ./downloaded/pdf.pdf: No such file or directory


RuntimeError: cannot open ./downloaded/pdf.pdf: No such file or directory

In [77]:
doc = fitz.open("./downloaded/PDF.pdf")
page_num = 4

def get_plain_text(doc, page_num):
    # Get page's text
    plain_text = doc[page_num].getText("text")
    no_newlines_text = plain_text.replace("\n", " ").lower()
    print(no_newlines_text)
    return

mupdf: cannot open ./downloaded/PDF.pdf: No such file or directory


RuntimeError: cannot open ./downloaded/PDF.pdf: No such file or directory

In [75]:
doc = fitz.open("./downloaded/.pdf")
page_num = 4

def handle_title_page(doc, page_num):
    plain_text = doc[page_num].getText("text")
    no_newlines_text = plain_text.replace("\n", " ").lower()
    print(no_newlines_text)
    
    # Check if ToC, or appendix.
    if no_newlines_text.startswith("table of contents"):
        print("toc")
    elif no_newlines_text.startswith("appendix"):
        print("appendix")
    else:
        print("neither")

mupdf: cannot open ./downloaded/.pdf: No such file or directory


RuntimeError: cannot open ./downloaded/.pdf: No such file or directory