# Extract Words from PDFs with Python

Inspired by [this medium article](http://bit.ly/2DiKRqH) by Rizwan Qaiser

In [1]:
#imports
import PyPDF2 
import textract
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import download as nltk_download

In [2]:
nltk_download('punkt')
nltk_download('stopwords')

[nltk_data] Downloading package punkt to /Users/geodev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/geodev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def extract_info_from_pdf(filename):
    '''
    Function to read a pdf tile, extract text where available and run OCR where no text
    Type print(text) to see what it contains.
    It likely contains a lot of spaces, possibly junk such as '\n' etc.
    
    args:
        filename (path to file and filename)
    returns: 
        text variable contains all the text derived from PDF file.
    '''
    #with open(filename, 'rb') as f:
    #    pdfData = f.read()

    pdfData = open(filename,'rb')
    
    # file data
    pdfReader = PyPDF2.PdfFileReader(pdfData)
    num_pages = pdfReader.numPages
    text = ''

    # read each page
    for i in range(num_pages):
        pageObj = pdfReader.getPage(i)
        text += pageObj.extractText()

    # check for words, PyPDF2 cannot read scanned files, 
    # run OCR library textract to convert scanned/image based PDF files into text
    if text != '':
        text = text
    else:
        #text = textract.process(pdfData, method='tesseract', language='eng')
        text = PyPDF2.extractText().encode('utf-8')
        
    return text


In [4]:
file_path = '../data/2005_Sleipner_Hugin_Petrophysical_evaluation.pdf'
extract_info_from_pdf(file_path)

"  1 Sleipner \nØst \nand Volve\n Model 2006\n Hugin and Skagerrak Formation\n Petrophysic\nal Evaluation\n          2  Title::\n  Sleipner \nØst\n and Volve\n Model 2006\n Hugin and Skagerrak Formation\n Petrophysic\nal Evaluation\n     Document no.:\n Contract no./project no.:\n Filing no.:\n 3781-06        Classification:\n Distribution:\n Statoil Internal\n Corporate Statoil\n  Distribution date:\n Rev. date:\n Rev. no.:\n Copy no.:\n 2006-11-10 2006-10-24   1 17  Author(s)/Source(s):\n Elin Solfjell, \n Karl Audu\nn Lehne\n Subjects:\n Petrophysical Evaluation, Petrophysical Database, Sleipner \nØst Hugi\nn \nand Skagerrak \nFormation,\n Volve Hugin\n, Sleipner \nand Skagerrak \nFormation, \nReservoir M\nodel 2006\n Remarks:\n  Valid from:\n Updated:\n 2006-11-10  Responsible publisher:\n Aut\nhority to approve deviations:\n SVG PTEC\n   Techn. responsible:\n Name:\n Date/Signature:\n SVG PTE\nC Elin Solfjell\n Petrophysicist\n  Karl Audun Lehne\n Petrophysicist\n  Recommended:\n 

In [31]:
def clean_text(text):
    '''
    function that takes text from pdf and cleans up
    args:
        text (str) object
    returns:
        keywords
    '''
    # word_tokenize() to break text phrases into individual words
    tokens = word_tokenize(text)
    # create a new list which contains punctuation we wish to clean
    punctuations = ['(',')',';',':','[',']',',','...']
    #We initialize the stopwords variable which is a list of words like #"The", "I", "and", etc. that don't hold much value as keywords
    stop_words = stopwords.words('english')
    #We create a list comprehension which only returns a list of words #that are NOT IN stop_words and NOT IN punctuations.
    keywords = [word for word in tokens if not word in stop_words and not word in punctuations]

    return keywords

In [32]:
clean_text(extract_info_from_pdf('../data/2005_Sleipner_Hugin_Petrophysical_evaluation.pdf'))

['1',
 'Sleipner',
 'Øst',
 'Volve',
 'Model',
 '2006',
 'Hugin',
 'Skagerrak',
 'Formation',
 'Petrophysic',
 'al',
 'Evaluation',
 '2',
 'Title',
 'Sleipner',
 'Øst',
 'Volve',
 'Model',
 '2006',
 'Hugin',
 'Skagerrak',
 'Formation',
 'Petrophysic',
 'al',
 'Evaluation',
 'Document',
 '.',
 'Contract',
 'no./project',
 '.',
 'Filing',
 '.',
 '3781-06',
 'Classification',
 'Distribution',
 'Statoil',
 'Internal',
 'Corporate',
 'Statoil',
 'Distribution',
 'date',
 'Rev',
 '.',
 'date',
 'Rev',
 '.',
 '.',
 'Copy',
 '.',
 '2006-11-10',
 '2006-10-24',
 '1',
 '17',
 'Author',
 '/Source',
 'Elin',
 'Solfjell',
 'Karl',
 'Audu',
 'n',
 'Lehne',
 'Subjects',
 'Petrophysical',
 'Evaluation',
 'Petrophysical',
 'Database',
 'Sleipner',
 'Øst',
 'Hugi',
 'n',
 'Skagerrak',
 'Formation',
 'Volve',
 'Hugin',
 'Sleipner',
 'Skagerrak',
 'Formation',
 'Reservoir',
 'M',
 'odel',
 '2006',
 'Remarks',
 'Valid',
 'Updated',
 '2006-11-10',
 'Responsible',
 'publisher',
 'Aut',
 'hority',
 'approve',
