In [17]:
import fitz
import json
import pandas as pd

In [38]:
%%time
# Open the source PDF with PyMuPDF (fitz). The resulting `doc` is passed to main(),
# which reads its length with len(doc) and accesses pages as doc[page_num].
doc = fitz.open("./downloaded/20.02564 Grand Forks - East Grand Forks Traffic Impact Study - Final.pdf")

def main(doc):
    """Extract per-page text and text properties from an opened PDF.

    Args:
        doc: Opened PDF document. It must support ``len(doc)`` and be accepted
            by ``get_pdf_text(doc, page_num)`` for every page index.

    Returns:
        pandas.DataFrame with one row per page and the columns:
            Page (int, zero-indexed),
            Text (str),
            Properties (list of tuples - tuples are (font, font size, color)).
        A document with no pages yields an empty frame with the same columns.
    """
    columns = ["Page", "Text", "Properties"]
    records = []

    for page_num in range(len(doc)):
        page_text, set_of_props = get_pdf_text(doc, page_num)
        records.append(
            {"Page": page_num, "Text": page_text, "Properties": list(set_of_props)}
        )

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame on every row while it existed.
    return pd.DataFrame(records, columns=columns)

# NOTE: main() calls get_pdf_text(), which is defined in a later cell of this notebook.
# That cell has to be executed before this one, otherwise this call raises NameError
# on a fresh kernel.
df = main(doc)
# Preview the first ten rows (one row per page).
df.head(10)

Wall time: 1.37 s


Unnamed: 0,Page,Text,Properties
0,0,Grand Forks – East Grand Forks Metropolitan ...,"[(NewsGothicStd-Bold, 10.0, 16777215), (NewsGo..."
1,1,"Ulteig Fargo Office 3350 38th Ave S Fargo, ND...","[(NewsGothicStd, 11.0, 16777215), (ArialNarrow..."
2,2,Table of CONTENTS TRANSPORTATION PLANNING SE...,"[(ArialNarrow-Bold, 48.0, 2438725), (ArialNarr..."
3,3,SECTION A: INTRODUCTION & EXECUTIVE SUMMA...,"[(NewsGothicStd-Bold, 16.0, 16296473), (ArialN..."
4,4,SECTION B: RESPONSE TO ADMINISTRATIVE QUE...,"[(NewsGothicStd-Bold, 16.0, 16296473), (ArialN..."
5,5,Traffic Analysis Traffic Data Collection and ...,"[(ArialNarrow, 10.5, 16777215), (ArialNarrow-B..."
6,6,Traffic Forecasts for year 2030 and 2045 will...,"[(ArialNarrow, 10.5, 16777215), (NewsGothicStd..."
7,7,Multi-Modal Connectivity Ulteig will provide ...,"[(NewsGothicStd-Bold, 10.0, 14818598), (ArialN..."
8,8,Issues Identification/Purpose and Need Devel...,"[(ArialNarrow, 10.5, 2373188), (NewsGothicStd-..."
9,9,BRIDGE • Compatibility with existing flood ...,"[(ArialNarrow, 10.5, 2373188), (ArialNarrow-Bo..."


In [13]:
def get_pdf_text(doc, page_num):
    """Return the text of one PDF page and the distinct text styles used on it.

    Args:
        doc: Opened PyMuPDF document, or any object where ``doc[page_num]``
            yields a page exposing ``get_text("json")`` or the legacy
            ``getText("json")``.
        page_num: Zero-based index of the page to read.

    Returns:
        A tuple ``(text, props)``:
            text: the text of every span, in the order the page JSON lists
                them, each preceded by a single space. ``""`` when the page
                has no text spans (blank or image-only page).
            props: set of ``(font, size, color)`` tuples, one per distinct
                style found on the page.

    Raises:
        KeyError: if the page JSON lacks the expected ``blocks`` / ``spans`` /
            span property keys.
    """
    page = doc[page_num]

    # PyMuPDF renamed getText -> get_text; prefer the new name and fall back to
    # the legacy camelCase one so both old and new installs work.
    extract = getattr(page, "get_text", None) or page.getText

    # Convert the JSON string into plain dicts/lists.
    page_json = json.loads(extract("json"))

    props_seen = set()
    pieces = []

    for block in page_json["blocks"]:
        # Skip image blocks and blocks without any lines; only text blocks matter.
        if block.get("image") is not None or block.get("lines") is None:
            continue

        for line in block["lines"]:
            for span in line["spans"]:
                # A set de-duplicates, so there is no need to track the previous style.
                props_seen.add((span["font"], span["size"], span["color"]))
                pieces.append(span["text"])

    # Each span is emitted exactly once. The earlier version re-appended the
    # final span after the loop (duplicating it) and crashed with
    # UnboundLocalError on pages that had no text spans at all.
    return "".join(" " + piece for piece in pieces), props_seen