## Extracting relevant information from PDF files using OCR and NLP techniques

In [21]:
import os
import spacy
import re
from dateutil.parser import *
import textract
import json

In [3]:
# Load the `en_core_web_md` spaCy pipeline once at start-up; Get_Invoice_Date
# runs text through it and keeps the entities labelled "DATE".
nlp = spacy.load('en_core_web_md')

In [4]:
# Switch into the folder holding the sample documents so that every PDF
# below can be referenced by its bare file name.
os.chdir(r'C:\Users\amitr\Desktop\Sample_Files')

# Sample documents processed by this notebook, in processing order.
all_pdfs = [
    '09f8cfb0-d649-0137.pdf',
    '202076540912.pdf',
    '201051107425035.pdf',
    'BD1-1421.pdf',
    'INV_UCB_19847612.pdf',
    'ISMP23497211.pdf',
]

# Keep the individual names available as well.
pdf1, pdf2, pdf3, pdf4, pdf5, pdf6 = all_pdfs

#### Function to check whether it's an Invoice or Engineering drawing

In [5]:
def Is_Invoice_Or_Drawing(text):
    """Classify extracted document text as an invoice or an engineering drawing.

    The text counts as an invoice when it contains any of the words
    "invoice", "lading" or "bill" (case-insensitive).

    Args:
        text: Text extracted from the document.

    Returns:
        "Invoice" when one of the keywords is present, otherwise
        "Engineering Drawing".
    """
    valid_invoice_tags = {'invoice', 'lading', 'bill'}

    # Tokenize on runs of letters rather than on whitespace: OCR output often
    # glues punctuation or digits to the keyword ("Invoice:", "INVOICE#123"),
    # and a plain split() would leave those tokens unmatched.
    words = re.findall(r"[^\W\d_]+", text.lower())

    if any(word in valid_invoice_tags for word in words):
        return "Invoice"
    return "Engineering Drawing"


#### Text Cleaning

In [6]:
def Clean_Text(file):
    """OCR a document and return its text flattened for keyword searching.

    Args:
        file: Path of the document handed to textract (tesseract method).

    Returns:
        The UTF-8 decoded OCR text with CRLF line breaks turned into spaces
        and every run of spaces collapsed to a single space.
    """
    raw_bytes = textract.process(file, method='tesseract')
    decoded = raw_bytes.decode('utf-8')

    # Join the lines first, then squeeze the extra spaces that joining
    # (and the OCR layout) leaves behind.
    single_line = decoded.replace("\r\n", " ")
    return re.sub(" +", " ", single_line)

In [7]:
def Get_Tag_Position(tag, text, direction = 'forward'):
    """Find *tag* in *text*, ignoring case, and return where its value starts.

    Args:
        tag: Label to look for, e.g. "invoice date".
        text: Text to search in.
        direction: 'reverse' looks for the last occurrence of the tag; any
            other value looks for the first occurrence.

    Returns:
        The index one character past the end of the tag (i.e. skipping the
        single separator character that follows it), or -1 when the tag does
        not occur in the text.
    """
    haystack = text.lower()
    needle = tag.lower()

    if direction == 'reverse':
        tag_pos = haystack.rfind(needle)
    else:
        tag_pos = haystack.find(needle)

    # find()/rfind() signal "not found" with -1. Index 0 is a genuine hit at
    # the very start of the text and must not be reported as a miss.
    if tag_pos < 0:
        return -1

    return tag_pos + len(needle) + 1

In [8]:
def _Is_Valid_Date(candidate):
    """Return True when *candidate* has 4+ characters and dateutil can parse it."""
    if len(candidate) < 4:
        return False
    try:
        parse(candidate)
    except Exception:
        # Best effort on noisy OCR tokens: dateutil documents ValueError and
        # OverflowError, but any parsing failure simply means "not a date".
        # Unlike a bare except, this lets KeyboardInterrupt/SystemExit through.
        return False
    return True


def Get_Invoice_Date(text):
    """Return the most likely invoice date found in *text*.

    Candidates are collected in priority order:

    1. DATE entities spaCy finds in the text following the "invoice date" tag.
    2. DATE entities spaCy finds in the text following the "date" tag.
    3. Every whitespace-separated token of the whole text.

    The first candidate of at least four characters that dateutil can parse
    is returned exactly as it appears in the text.

    Args:
        text: Cleaned text of the document.

    Returns:
        The matching date string, or "No Invoice Date Found".
    """
    candidates = []

    for tag in ("invoice date", "date"):
        start_pos = Get_Tag_Position(tag, text)
        if start_pos < 0:
            # Tag absent. Slicing with -1 would feed the last character of
            # the text to the NER model, so skip this tag instead.
            continue
        doc = nlp(text[start_pos:])
        candidates += [str(ent) for ent in doc.ents if ent.label_ == "DATE"]

    # Fallback candidates: the raw tokens, validated lazily below so that
    # each one is parsed at most once.
    candidates += text.split()

    for candidate in candidates:
        if _Is_Valid_Date(candidate):
            return candidate

    return "No Invoice Date Found"

#### Get Invoice Amount

In [9]:
# Compiled once instead of on every call. Each alternative ends with a
# lookahead for whitespace or end-of-text (rather than a literal space), so an
# amount that closes the text or is followed by a newline/tab is still found.
_AMOUNT_PATTERN = re.compile('|'.join([
    r"\$?\d+\.?\d+\,\d{1,2}(?=\s|$)",  # EUR format
    r"\$?\d+\,?\d+\.\d{1,2}(?=\s|$)",  # USD format
    r"\$\d+\,?\d*\.?\d{1,2}(?=\s|$)",  # USD format without decimals
]))


def Parse_Amounts(text):
    """Return every monetary amount found in *text*, in order of appearance.

    Recognized shapes are EUR-style amounts (comma as decimal separator),
    USD-style amounts (dot as decimal separator) and dollar-prefixed amounts
    without decimals. An amount must be followed by whitespace or by the end
    of the text.

    Args:
        text: Text to scan.

    Returns:
        A list of the matched amount strings (empty when none are found).
    """
    return _AMOUNT_PATTERN.findall(text)

In [10]:
def Get_Invoice_Amount(text):
    """Return the invoice amount found in *text*.

    The last formatted amount recognized by Parse_Amounts wins. When no
    formatted amount exists, the first run of digits after the last "total"
    tag is used instead.

    Args:
        text: Cleaned text of the document.

    Returns:
        The amount as it appears in the (lower-cased) text, or an empty
        string when nothing suitable is found.
    """
    text = text.lower()

    amounts = Parse_Amounts(text)
    if amounts:
        return amounts[-1]

    start_pos = Get_Tag_Position("total", text, direction='reverse')
    if start_pos < 0:
        # No "total" tag. Slicing with -1 would inspect only the final
        # character of the text and could return a stray digit.
        return ""

    first_number = re.search(r"\d+", text[start_pos:])
    return first_number.group() if first_number else ""

### Parse Engineering Drawing data

In [11]:
# Search for the exact tag mentioned as "job"

def Get_JobID(text):
    """Return the job ID that follows the "job" tag in *text*.

    Only the first ten whitespace-separated tokens after the tag are
    inspected; the first purely numeric one is the job ID.

    Args:
        text: Cleaned text of the document.

    Returns:
        The job ID as a string, or "No job ID found" when the tag is missing
        or no numeric token follows it closely enough.
    """
    value_start = Get_Tag_Position("job", text)
    if value_start < 0:
        return "No job ID found"

    window = text[value_start:].split()[:10]
    numeric_tokens = [word for word in window if word.isnumeric()]

    return numeric_tokens[0] if numeric_tokens else "No job ID found"

In [12]:
# Search for the exact tag mentioned as "pipe"

def Get_Pipe_Classes(text):
    """Return the pipe classes named in *text*.

    A pipe class is the purely alphabetic token that directly follows the
    word "pipe" (case-insensitive). Pipe characters ("|") are treated as
    token separators.

    Args:
        text: Cleaned text of the document.

    Returns:
        A list of upper-cased pipe classes in order of appearance, or the
        string "No pipe class found" when there are none.
    """
    tokens = text.lower().replace("|", " ").split()

    # Pair every token with its successor. A trailing "pipe" has no successor
    # and therefore yields no pair, instead of indexing past the end.
    valid_matches = [
        following.upper()
        for current, following in zip(tokens, tokens[1:])
        if current == "pipe" and following.isalpha()
    ]

    if valid_matches:
        return valid_matches
    return "No pipe class found"

### Get Invoice Number

In [13]:
def Extract_Tag_Data(tag, text, direction = 'forward'):
    """Return the value that follows *tag* in pipe-delimited *text*.

    Args:
        tag: Label to look for.
        text: Text whose fields are separated by "|".
        direction: Passed through to Get_Tag_Position ('forward' or 'reverse').

    Returns:
        Everything between the tag and the next "|" (or the end of the
        text), or an empty string when the tag is not found.
    """
    value_start = Get_Tag_Position(tag, text, direction=direction)
    if value_start <= 0:
        return ''

    # Keep only the remainder of the field the tag sits in.
    field, _, _ = text[value_start:].partition('|')
    return field

In [14]:
def Get_Invoice_Number(file):
    """OCR *file* and return the invoice (or bill of lading) number.

    The OCR text is turned into a pipe-delimited, lower-cased string with one
    field per line, because invoice numbers may themselves contain spaces.
    Keywords are then tried from most to least specific, each at its last
    occurrence in the text, and the first keyword followed by a non-empty
    value wins.

    Args:
        file: Path of the document handed to textract (tesseract method).

    Returns:
        The upper-cased invoice number; when it is longer than 20 characters
        only the part before the first "/" is kept. Returns
        "No Invoice Number found" when no keyword yields a value.
    """
    # Preprocess the text. Using pipe delimited format as the invoice number has spaces in some cases
    #------------------------------------------------------------------------------------------------
    text = textract.process(file, method='tesseract').decode('utf-8')

    text = text.replace("\r\n", "|")
    text = re.sub(" +", " ", text)

    # Drop empty and single-character fields.
    text = '|'.join(field for field in text.split('|') if len(field) > 1)

    text = text.lower()
    # Normalize the common abbreviations so that a single keyword matches them.
    # Raw string: "\." in a normal literal is an invalid escape sequence.
    text = re.sub(r" number:| no\.| n\.", " number", text)

    # Extract the Invoice Number
    #---------------------------
    keywords = ["invoice number", "lading number", "invoice", "lading", "number"]

    for keyword in keywords:
        token = Extract_Tag_Data(keyword, text, 'reverse')
        if not token:
            continue

        # Only the first hit is ever used, so stop searching here.
        invoice_number = token.upper()
        if len(invoice_number) > 20:
            invoice_number = invoice_number.split('/')[0]
        return invoice_number

    return "No Invoice Number found"

## Packaging it all together

In [18]:
# Classify every sample PDF and print the fields extracted for its type.
for file in all_pdfs:

    print("Processing File :", file)

    file_text = Clean_Text(file)

    if Is_Invoice_Or_Drawing(file_text) != "Invoice":

        # Anything that is not an invoice is handled as an engineering drawing.
        Job_ID = Get_JobID(file_text)
        Pipe_Class = Get_Pipe_Classes(file_text)

        print("Document Type : Engineering Drawing")
        print("Job ID :", Job_ID)
        print("Pipe Classes :", Pipe_Class)
        print("\n")
        continue

    # The invoice number is extracted from the file itself because it needs
    # its own pipe-delimited preprocessing of the OCR text.
    Invoice_Number = Get_Invoice_Number(file)
    Invoice_Date = Get_Invoice_Date(file_text)
    Invoice_Amount = Get_Invoice_Amount(file_text)

    print("Document Type : Invoice")
    print("Invoice Number :", Invoice_Number)
    print("Invoice Date :", Invoice_Date)
    print("Invoice Amount :", Invoice_Amount)
    print("\n")

Processing File : 09f8cfb0-d649-0137.pdf
Document Type : Invoice
Invoice Number : CMDU SZ. 023 7/220
Invoice Date : 2019
Invoice Amount : 19.59


Processing File : 202076540912.pdf
Document Type : Invoice
Invoice Number : 8555691 
Invoice Date : 03.01.2019
Invoice Amount : 21.153,60


Processing File : 201051107425035.pdf
Document Type : Invoice
Invoice Number : TERRELL RICHEMOND VIVAMUS IN FELIS EU SAPIEN CURSUS
Invoice Date : November 6, 1971
Invoice Amount : 4618


Processing File : BD1-1421.pdf
Document Type : Engineering Drawing
Job ID : 14216
Pipe Classes : ['SMHLS', 'SMLS']


Processing File : INV_UCB_19847612.pdf
Document Type : Invoice
Invoice Number : 90000178
Invoice Date : 08 January 2019
Invoice Amount : 38.707,27


Processing File : ISMP23497211.pdf
Document Type : Invoice
Invoice Number : INV-3337
Invoice Date : January 25, 2016
Invoice Amount : $93.50




## Creating the final JSON output

In [45]:
# Document whose extracted fields are exported to JSON below.
file = '201051107425035.pdf'

In [46]:
# Name the output after the input file minus its extension. splitext strips
# only the final suffix, so names or folders containing extra dots are not
# truncated at the first dot.
file_name = os.path.splitext(file)[0]

file_text = Clean_Text(file)

# Fields extracted from the document; the keys depend on the document type.
data = {}

if Is_Invoice_Or_Drawing(file_text) == "Invoice":
    
    data['Document Type']  = "Invoice"
    # The invoice number is read from the file itself, not from file_text.
    data['Invoice Number'] = Get_Invoice_Number(file)
    data['Invoice Date']   = Get_Invoice_Date(file_text)
    data['Invoice Amount'] = Get_Invoice_Amount(file_text)
    
else:

    data['Document Type']  = "Engineering Drawing"
    data['Job Number'] = Get_JobID(file_text)
    data['Pipe Class'] = Get_Pipe_Classes(file_text)
    
# Write the result next to the source document as <file_name>.json.
with open(file_name + '.json', 'w') as f:
    json.dump(data, f)