In [1]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import re
import pandas as pd

### PDF converter function definition

In [2]:
def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    setattr(laparams, 'all_texts', True)
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()

    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)

    text = retstr.getvalue()

    fp.close()
    device.close()
    retstr.close()
    return text

#### Category Dictionary and Column Generation

In [3]:
#dictionary and categorizing
def categorize_strings(strings):
    categories = {
        'must': 'Requirement',
        'should': 'Requirement',
        'can': 'Optional Requirement',
        'may': 'Optional Requirement'
    }
    
    headerPattern0 = r'\d+[a-zA-Z]|[a-zA-Z]+\d+'
    categorized_strings = []
    
    for string in strings:
        categorized_string = {'category': None}
        count = 0
        dig = False
        for char in string:
            if char.isdigit():
                dig = True
            if char.isupper():
                count += 1
        if count > 12 or '..' in string or '--' in string or re.search(headerPattern0, string) is not None:
            categorized_string = {'category': 'Header'}
        else:
            for keyword, category in categories.items():
                if keyword in string.lower():
                    categorized_string['category'] = category
                    break

        categorized_strings.append(categorized_string)
    
    return categorized_strings

#### Division column Generation

In [57]:
def Division(strings, pattern):
    divided = []
    current_division = 'None'
    
    
    for string in strings:
        if re.match(pattern, string):
            current_division = string.split(' ',1)[0]
            current_division = {'Section': current_division}
        divided.append(current_division)
    return divided

#### Random Symbol Cleaning 

In [5]:
def symbolClean(text):
    #Cleaned up super random symbols
    uselessPatterns = ['""',"‘‘", "-'", ",‘", "--", "-‘","--'',","',",",,", ',-']
    for i in uselessPatterns:
        cleanedtext = text.replace(i, "")
    spaced = cleanedtext.split()
    spaced = ' '.join(spaced)
    return spaced

#### Lemmatization

In [6]:
# def lemmatization(text): 
#     ##Testing Lemmatization
#     #Not sure if it really worked
#     from nltk.stem import WordNetLemmatizer
#     stemmer = WordNetLemmatizer()

#     text = text.split()

#     lemmatizedtext = [stemmer.lemmatize(word) for word in text]
#     lemmatizedtext = ' '.join(text)
#     # Text.append(text)
#     return lemmatizedtext


#### Useless short portions of text removal

In [7]:
def shortparaRemove(text):
    #Remove Useless short sentences less than 200 character??
    for sentence in text:
        if len(sentence) > 200:
            text.remove(sentence)
    return text

#### Compiler

In [33]:
def compile(Source, CleanText, Categories, Sections, orgText, cleantext0, cleantext2):
    end_text = []
    for string in CleanText:
        stringy = {'Text': string}
        end_text.append(stringy)
    #okay this is the long way, not sure how 
    #I want to do it better but there is a way
    finished = []

    for index in range(len(CleanText)):
        element = {}
        
        identifier = {'ID': index}
        element.update(identifier)
        
        if isinstance(Source, dict):
            element.update(Source)
            
        if index < 1:
            element.update({'Original Text': orgText})
            element.update({'Better Original Text': orgText.replace("\n", " ")})
            element.update({'First Clean (Random Symbol Removal)': cleantext0 })
            element.update({'Second Clean (Short Sentence Removal\ Sentence Splitting)': cleantext2})
            
            
            
        if isinstance(end_text[index], dict):
            element.update(end_text[index])

        if isinstance(Categories[index], dict):
            element.update(Categories[index])
            
        if isinstance(Sections[index], dict):
            element.update(Sections[index])


        finished.append(element)
    return finished

#### Creates Excel document using Pandas and exports to desktop labeled "PDFtoExcel.xlsx"

In [9]:
import os

def toExcel(table, fileName):
    output_dir = r'C:\Users\Ben\Desktop\PDF_Testing'
    output_path = os.path.join(output_dir, f'{fileName}.xlsx')
    
    Excel = pd.DataFrame(table)
    pd.DataFrame.to_excel(Excel, output_path)
    
    return


#### Creates Source

In [71]:
#The fileName must have '_' between all of the words
def sourceName(fileName):
    name = fileName.rfind('\\')
    if fileName != -1:
        source = fileName[name + 1:]
#     words = fileName.split('_')
#     sourceWord = ''
#     for word in words:
#         if word.isdigit():
#             sourceWord += word
#         else:
#             sourceWord += word[0]
    source = {'Source': source}
    return source


In [72]:
source

'L_F_R_D_2016.pdf'

# MAIN

In [74]:
#PDF to test
PDF = r"C:\Users\Ben\Desktop\PDF_Testing\L_F_R_D_2016.pdf"
pattern = r'^\d+\.\d+'
#orgText = convert_pdf_to_txt(PDF)
text = orgText.replace("\n", " ")
#Tons of random symbols like ';._' that need to be removed for a cleaner loo
cleantext0 = symbolClean(text)
#Makes words simpiler, not sure if helpful but useful for ML later
#cleantext1 = lemmatization(cleantext0)
#Splits sentences into seperate strings
split = re.split(r'(?<=\.)[ \n]', cleantext0)
cleantext2 = shortparaRemove(split)
## Section covers categorization of strings
categories = categorize_strings(cleantext2)
## Section division
sections = Division(cleantext2, pattern)
#creates Source name
source = sourceName(PDF)
#Creating the list for excel upload
table = compile(source, cleantext2, categories, sections, orgText, cleantext0,cleantext2)
#Sending table to Excel
toExcel(table, 'Test33')


In [22]:
text



In [77]:
cleantext2

['LRFD Road Tunnel Design and Construction Guide Specifications First Edition, 2017 Publ.',
 'All rights reserved.',
 'Duplication is a violation of applicable law.',
 'Photo provided by Bijan Khaleghi, Washington State DOT.',
 '© 2017 by the American Association of State Highway and Transportation Officials.',
 'All rights reserved.',
 'Duplication is a violation of applicable law.',
 'ISBN: 978-1-56051-643-9 Pub Code: LRFDTUN-1 © 2017 by the American Association of State Highway and Transportation Officials.',
 'All rights reserved.',
 'Duplication is a violation of applicable law.',
 'C.',
 'REGIONAL REPRESENTATIVES: REGION I: Leslie Richards, Pennsylvania Pete Rahn, Maryland REGION II: Charles Kilpatrick, Virginia James Bass, Texas REGION III: Randall S.',
 'All rights reserved.',
 'Duplication is a violation of applicable law.',
 'JOHNSON, Vice Chair JOSEPH L.',
 'HARTMANN, Federal Highway Administration, Secretary PATRICIA J.',
 'BUSH, AASHTO Liaison ALABAMA, Eric J.',
 'Christie

In [79]:
cleantext2[870:900]


['See Article 2.8.10.',
 '\uf0b7 Structural connections and attachments shall be detailed in a manner that facilitates visual inspection and access to the connection or repairs or maintenance.',
 'attachment Components of connections and attachments shall be detailed to be easily replaced.',
 'for © 2017 by the American Association of State Highway and Transportation Officials.',
 'All rights reserved.',
 'Duplication is a violation of applicable law.',
 '2.9 REFERENCES 1.',
 'AASHTO.',
 'LRFD Bridge Design Specifications.',
 '7th ed.',
 'American Association of State Highway and Transportation Officials, Washington, DC, 2014.',
 '2.',
 'AASHTO.',
 'Technical Manual for Design of Road Tunnels—Civil Elements.',
 'American Association of State Highway and Transportation Officials, Washington, DC, 2010.',
 '3.',
 'AMCA.',
 'Publication 201, Fans and Systems.',
 'Air Movement and Control Association International, Inc., Arlington Heights, IL, 2007.',
 '4.',
 'ANSI/IES RP-8 – Standard Pract

In [80]:
sections[870:900]

[{'Section': '2.8.11.1—Communication'},
 {'Section': '2.8.11.1—Communication'},
 {'Section': '2.8.11.1—Communication'},
 {'Section': '2.8.11.1—Communication'},
 {'Section': '2.8.11.1—Communication'},
 {'Section': '2.8.11.1—Communication'},
 {'Section': '2.9'},
 {'Section': '2.9'},
 {'Section': '2.9'},
 {'Section': '2.9'},
 {'Section': '2.9'},
 {'Section': '2.9'},
 {'Section': '2.9'},
 {'Section': '2.9'},
 {'Section': '2.9'},
 {'Section': '2.9'},
 {'Section': '2.9'},
 {'Section': '2.9'},
 {'Section': '2.9'},
 {'Section': '2.9'},
 {'Section': '2.9'},
 {'Section': '2.9'},
 {'Section': '2.9'},
 {'Section': '2.9'},
 {'Section': '2.9'},
 {'Section': '2.9'},
 {'Section': '2.9'},
 {'Section': '2.9'},
 {'Section': '2.9'},
 {'Section': '2.9'}]