In [1]:
import camelot
import re
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from datetime import date

In [2]:
# Label patterns mapped to the name of the output field they identify.
# sellerRegex is iterated (in this insertion order) by getSellerInfo and
# backupSellerInfo; its keys are compared by string equality there.
sellerRegex = {
    'đơn vị bán hàng|đơn vị bán': 'sellerLegalName',
    'mã số thuế|mst': 'sellerTaxCode',
}
# Buyer counterpart of sellerRegex (not referenced in the visible part of the notebook).
buyerRegex = {
    'tên đơn vị|đơn vị': 'buyerLegalName',
    'mã số thuế|mst': 'buyerTaxCode',
}
# Presumably the separator characters accepted inside dates -- TODO confirm;
# not referenced in the visible part of the notebook.
datespliter = ['-', '/']

In [3]:
def convert(fname, pages=None):
    """Extract the text of a PDF file with pdfminer.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers to restrict the extraction
            to. When it is None or empty, an empty set is handed to
            PDFPage.get_pages (pdfminer is expected to treat that as
            "every page" -- confirm against the installed pdfminer version).

    Returns:
        The text written by pdfminer's TextConverter for the processed
        pages, as a single string.

    Raises:
        OSError: If the file cannot be opened.
    """
    pagenums = set(pages) if pages else set()

    # The buffer, the converter and the input file are all released even when
    # pdfminer raises while processing a page.
    with StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer before the `with` block closes it.
        return output.getvalue()

In [4]:
################# GET SELLER INFORMATION ######################
def preprocessLegalName(name):
    """Collapse every run of whitespace in ``name`` into a single space.

    Leading and trailing whitespace is removed as a side effect of splitting.
    """
    words = name.split()
    return ' '.join(words)

def preprocessTaxCode(code):
    """Keep only the ASCII digits and hyphens of ``code``, in their original order."""
    allowed = '0123456789-'
    return ''.join(ch for ch in code if ch in allowed)

# Tell whether the cell mentions a tax code label.
def containBasicInfo(cell):
    """Return True when every required label pattern occurs in ``cell`` (case-insensitive)."""
    required = ['mã số thuế|mst']
    return all(re.search(pattern, cell, re.IGNORECASE) for pattern in required)

# Tell whether the cell looks like the buyer section of the invoice.
def containBuyerInfo(cell):
    """Return True when the unit-name, tax-code and payment-method labels all occur in ``cell``.

    Matching is case-insensitive; a single missing label yields False.
    """
    required = ['tên đơn vị|đơn vị', 'mã số thuế|mst', 'hình thức thanh toán|httt']
    return all(re.search(pattern, cell, re.IGNORECASE) for pattern in required)
    
# Tell whether the cell looks like the seller section of the invoice.
def containSellerInfo(cell):
    """Return True when ``cell`` has a tax-code label but is not recognised as buyer information."""
    return containBasicInfo(cell) and not containBuyerInfo(cell)

# Pull the seller's legal name out of the cell.
def getSellerLegalName(cell):
    """Cut the seller's legal name out of ``cell``.

    The name is the stripped slice of ``cell`` that
      * starts after the seller label (and after the first ':' that follows
        it, wherever that colon is), when such a label is present;
      * is then advanced to the earliest company keyword ('công ty',
        'chi nhánh', ...) found after that point, if any;
      * ends at the earliest following field label or invoice-header word.

    All patterns are plain case-insensitive substring searches, so a short
    pattern such as 'số' also matches inside longer words.
    """
    label_patterns = ['đơn vị bán hàng|đơn vị bán']
    company_patterns = ['công ty', 'doanh nghiệp', 'tập đoàn', 'chi nhánh', 'tổng công ty']
    stop_patterns = [
        # labels of the fields that may follow the name
        'mã số thuế|mst', 'địa chỉ', 'điện thoại', 'website', 'số tài khoản|stk',
        # words taken from the invoice header
        'hóa đơn', 'giá trị', 'gia tăng', 'mẫu số', 'ký hiệu', 'số', 'liên', 'ngày', 'tháng', 'năm',
    ]

    first = 0
    last = len(cell)

    # Step past the seller label and the colon after it.
    for pattern in label_patterns:
        label = re.search(pattern, cell[first:], re.IGNORECASE)
        if label is None:
            continue
        first += label.end()
        colon = re.search(':', cell[first:], re.IGNORECASE)
        if colon is not None:
            first += colon.end()
        break

    # Move forward to the earliest company keyword; offsets are relative to `first`.
    offsets = []
    for pattern in company_patterns:
        hit = re.search(pattern, cell[first:last], re.IGNORECASE)
        if hit is not None:
            offsets.append(hit.start())
    if offsets:
        first += min(offsets)

    # Stop at whichever terminating pattern shows up first.
    for pattern in stop_patterns:
        hit = re.search(pattern, cell[first:], re.IGNORECASE)
        if hit is not None:
            last = min(last, first + hit.start())

    return cell[first:last].strip()

# Fall back to the full page text when the table-based extraction is incomplete.
def backupSellerInfo(text, seller):
    """Complete ``seller`` from the raw page text when a field is missing.

    Args:
        text: Text to run getSellerInfo on when the fallback is needed.
        seller: Dict of seller fields found so far, keyed by the values of
            sellerRegex.

    Returns:
        ``seller`` itself when every field named in sellerRegex is already
        non-empty. Otherwise a new dict based on ``seller`` in which each of
        those fields holds the value found in ``text`` when there is one, the
        value already in ``seller`` otherwise, and '' when neither exists.
        ``seller`` is never mutated.
    """
    fields = list(sellerRegex.values())

    # `.get` may return None for an absent key, so test truthiness rather than len().
    if all(seller.get(field) for field in fields):
        return seller

    fallback = getSellerInfo(text)
    merged = dict(seller)
    for field in fields:
        # Prefer the text-based value, but never throw away a value we already
        # had just because the fallback came back empty.
        merged[field] = fallback.get(field) or seller.get(field) or ''
    return merged

# Extract the seller's name and tax code from the cell.
def getSellerInfo(cell):
    """Extract the seller's legal name and tax code from a block of invoice text.

    The text is scanned for tax-code labels ('mã số thuế' / 'mst'). For each
    label, the text between the current scan position and the label is
    examined: the label is accepted as the seller's when that text contains a
    seller label, or when it contains neither a buyer label nor a reference to
    the e-invoice provider. A rejected label is skipped and the scan resumes
    right after it.

    Args:
        cell: Text to search.

    Returns:
        A dict with one entry per value of sellerRegex ('sellerLegalName',
        'sellerTaxCode') for the first accepted label, or an empty dict when
        no tax-code label is accepted.
    """
    result = {}
    basic_regex = 'mã số thuế|mst'
    seller_regex = 'đơn vị bán hàng|đơn vị bán'
    buyer_regex = 'khách hàng|mua hàng|tên đơn vị|đơn vị'
    third_regex = 'cung cấp giải pháp hóa đơn điện tử|phát hành|bởi'
    all_regex = ['địa chỉ', 'mã số thuế|mst', 'điện thoại', 'website', 'số tài khoản|stk']

    start = 0
    while start < len(cell):
        taxcode_found = re.search(basic_regex, cell[start:], re.IGNORECASE)
        if not taxcode_found:
            break

        # The match was made on cell[start:], so its offsets are relative to
        # `start`; convert them to absolute positions once.
        label_start = start + taxcode_found.start()
        label_end = start + taxcode_found.end()
        preceding = cell[start:label_start]

        is_seller = bool(re.search(seller_regex, preceding, re.IGNORECASE)) or (
            not re.search(buyer_regex, preceding, re.IGNORECASE)
            and not re.search(third_regex, preceding, re.IGNORECASE)
        )
        if not is_seller:
            # Resume after this label. Using the absolute position guarantees
            # the scan always moves forward.
            start = label_end
            continue

        for key, engkey in sellerRegex.items():
            if key == seller_regex:
                # The legal name is looked up in the text before the tax-code label.
                value = preprocessLegalName(getSellerLegalName(preceding))
            else:
                # The value starts after the first ':' following the label ...
                begin = label_end
                firstcolonfound = re.search(':', cell[begin:])
                if firstcolonfound:
                    begin = begin + firstcolonfound.end()

                # ... and cannot extend past the next ':' (which belongs to the
                # following field), or past the end of the text.
                end = len(cell)
                secondcolonfound = re.search(':', cell[begin:])
                if secondcolonfound:
                    end = begin + secondcolonfound.end()

                # Cut at the first other field label (in all_regex order) found in that window.
                for otherreg in all_regex:
                    if otherreg == key:
                        continue
                    actualend = re.search(otherreg, cell[begin:end], re.IGNORECASE)
                    if actualend:
                        end = begin + actualend.start()
                        break

                value = cell[begin:end].strip()
                if key == basic_regex:
                    value = preprocessTaxCode(value)
            result[engkey] = value
        return result

    return result

In [5]:
################ MAIN PROCESS #####################

# Look for the seller information in the extracted tables.
def getPartiesInfo(tables):
    """Return the seller info found in the first table cell that yields any.

    ``tables`` is indexed by position and each item must expose a ``.df``
    frame. Cells are visited table by table, row by row, left to right; each
    cell containing a tax-code label is passed to getSellerInfo until one
    call returns a non-empty dict. An empty dict is returned when nothing is
    found.
    """
    seller = {}

    for table_index in range(len(tables)):
        frame = tables[table_index].df
        row_count, column_count = frame.shape
        for row_index in range(row_count):
            cells = frame.iloc[row_index, :].values
            for column_index in range(column_count):
                if seller:
                    # A previous cell already produced the seller info.
                    return seller
                text = cells[column_index]
                if containBasicInfo(text):
                    seller = getSellerInfo(text)

    return seller

# Extract the seller information of one PDF.
def extract_from_pdf(inputfile):
    """Return the seller fields extracted from the PDF at ``inputfile``.

    The tables detected by camelot are searched first. When they yield
    nothing, empty defaults are used, and backupSellerInfo then gets a chance
    to fill the fields from the text produced by convert().
    """
    tables = camelot.read_pdf(inputfile, pages="1-end", flavor='lattice', process_background=True)
    text = convert(inputfile)

    seller = getPartiesInfo(tables)
    if not seller:
        seller = {'sellerLegalName': '', 'sellerTaxCode': ''}

    result = {}
    result.update(backupSellerInfo(text, seller))
    return result

In [6]:
# Run the extraction on one PDF (path relative to the notebook's working
# directory) and print the resulting dict.
inputfile = 'Data/56.pdf'
print(extract_from_pdf(inputfile))

{'sellerLegalName': 'CÔNG TY TNHH ÁNH SÁNG TIẾN DƯ - CHI NHÁNH HỒ CHÍ MINH', 'sellerTaxCode': '0104349250-001'}
