# Extract data from PDF using regex

Although many PDF parsers can scrape data from a PDF document, the order of the extracted output can be unpredictable, for example:

In [38]:
import pandas as pd
# numpy gets its conventional alias; binding it to `pd` would shadow pandas.
import numpy as np
import re

## Example - Credit Bureau Report

![](pdf/creditform.png)

In [1]:
from cStringIO import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt

#converts pdf, returns its text content as a string
def convert(fname, pages=None):
    """Extract the text content of a PDF file and return it as one string.

    Parameters
    ----------
    fname : str
        Path of the PDF file to read (opened in binary mode).
    pages : iterable of int, optional
        Page numbers to restrict the extraction to. They are handed to
        ``PDFPage.get_pages`` as a set; when ``pages`` is None or empty an
        empty set is passed instead.
        NOTE(review): pdfminer is expected to treat an empty set as "all
        pages" and to use zero-based page numbers -- confirm against the
        installed pdfminer version.

    Returns
    -------
    str
        Everything the ``TextConverter`` wrote while the selected pages were
        processed.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    try:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # `with` guarantees the file handle is released even if a page
            # fails to parse.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer only after the converter is closed, and before the
        # buffer itself is closed.
        return output.getvalue()
    finally:
        # The closing call needs its parentheses; a bare `output.close`
        # only looks the method up and never releases the buffer.
        output.close()

In [4]:
raw_text = convert('pdf/3A_e-Premium.pdf')
# Split into lines and drop the empty ones. Compare by value (`!=`): `is not ''`
# tests object identity and only appears to work because of string interning.
text = [line for line in raw_text.split('\n') if line != '']

In [6]:
# Preview the extracted lines at indices 8 through 29 to see how the parser ordered them.
text[8:30]

['Company Identification',
 'Company Name:',
 'Zhejiang Heavy Machinery & Equipment  Co., Ltd. (Sample Report)',
 'Company Name (CN): \xe6\xb5\x99\xe6\xb1\x9f\xe9\x87\x8d\xe5\xb7\xa5\xe6\x9c\xba\xe6\xa2\xb0\xe8\xae\xbe\xe5\xa4\x87\xe6\x9c\x89\xe9\x99\x90\xe5\x85\xac\xe5\x8f\xb8 (\xe6\xa0\xb7\xe6\x9c\xac\xe6\x8a\xa5\xe5\x91\x8a)',
 'No. 67, Xinggang Road',
 'Science & Technology Tower 1',
 'Phone:',
 '+86-0579-67525900,',
 '67525908',
 'Facsimile:',
 '+86-0579-67525902',
 'E-mail:',
 'Website:',
 'Zip Code:',
 'Organizational',
 'Code:',
 'Tax Nr:',
 'info@beijinghme.com',
 'www.beijinghme.com',
 '320513',
 '667179359',
 '667581667179359']

> Ideally, each field name would be immediately followed by its value, e.g. 'Company Name:', 'Zhejiang Heavy Machinery...'. However, other field names such as 'E-mail:' and 'Website:' are not immediately followed by their values, 'info@beijinghme.com' and 'www.beijinghme.com'.

> Since we cannot rely on the field name, we need to use the characteristics of the field values instead, e.g.

In [34]:
website = [line for line in text if 'www' in line]
email = [line for line in text if '@' in line]
tel = [line for line in text if '+86' in line]
print 'website:', website, '\n','email:', email 
print '\n', 'tel:', tel[:2]

website: ['www.beijinghme.com'] 
email: ['info@beijinghme.com']

tel: ['+86-0579-67525900,', '+86-0579-67525902']


## Using Regular Expression

In [57]:
def extract_paragraph(text, keyword1, keyword2, remove_empty = False):
    """Return the run of lines between a start keyword and an end keyword.

    The first line of ``text`` that contains any entry of ``keyword1`` opens
    the paragraph and is included. The following lines are collected until a
    line containing any entry of ``keyword2`` is met (that line is excluded)
    or the input is exhausted. Only the first occurrence of a start keyword is
    considered. The literal marker ``'THE END'`` is always appended, even when
    no start keyword was found.

    Parameters
    ----------
    text : list of str
        The lines to search.
    keyword1 : iterable of str
        Substrings that mark the first line of the paragraph.
    keyword2 : iterable of str
        Substrings that mark the line at which the paragraph stops. The start
        line itself is not tested against these.
    remove_empty : bool, optional
        When true, empty strings are removed from the result.

    Returns
    -------
    list of str
        The collected lines followed by ``'THE END'``.
    """
    paragraph = []
    for start, first_line in enumerate(text):
        if not any(w in first_line for w in keyword1):
            continue
        paragraph.append(first_line)
        # Walk the remaining lines by index so the loop ends cleanly at the end
        # of the input; this also covers a start keyword on the very last
        # line, where there is nothing left to look at.
        for pos in range(start + 1, len(text)):
            if any(w in text[pos] for w in keyword2):
                break
            paragraph.append(text[pos])
        break
    paragraph.append('THE END')
    if remove_empty:
        # Value comparison: `is not ''` would test identity, not emptiness.
        paragraph = [line for line in paragraph if line != '']
    return paragraph

def regex_find(expression, text):
    """Search every line of ``text`` for ``expression`` and keep the hits.

    ``expression`` is wrapped in a capturing group between a lazy ``.*?`` and
    a greedy ``.*``, so the surrounding pattern swallows the rest of the line
    after the first hit.

    Parameters
    ----------
    expression : str
        Regular-expression fragment to look for.
    text : iterable of str
        Lines to scan.

    Returns
    -------
    list of list
        One ``re.findall`` result per line that produced at least one match;
        lines without a match are left out.
    """
    wrapped = '.*?(' + expression + ').*'
    per_line_hits = (re.findall(wrapped, line) for line in text)
    return [hits for hits in per_line_hits if hits]
    

In [58]:
# Look for a YYYY-MM-DD style date on each line of the report and preview the first five hits.
date_hits = regex_find('[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]', text)
date_hits[:5]


[['2014-03-06'],
 ['2014-03-06'],
 ['2006-04-01'],
 ['2014-03-06'],
 ['2006-04-01']]

> There are many dates in this report!

In [59]:
# Narrow the search to the lines from the first one mentioning 'Legal Information'
# up to (not including) the next line that mentions 'Business Scope'.
legal_section = extract_paragraph(text, ['Legal Information'], ['Business Scope'])
regex_find('[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]', legal_section)

[['2006-04-01'], ['2056-03-31']]

> This limits the results to the 'Incorporation Date' and 'Expiration Date'.