In [1]:
import camelot
import re
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from datetime import date

In [32]:
################ GET COSTS #################
#convert cost string (with '.' and ',') to number
def costStringToNum(cost):
    """Convert a formatted cost string into a float.

    Every '.' is skipped (it is treated as a thousands separator) and ','
    starts the fractional part, so '1.234,56' becomes 1234.56.

    Parameters
    ----------
    cost : str
        Text made of digits, '.' and ','. Any other character is handed to
        ``int()`` and therefore raises ``ValueError``.

    Returns
    -------
    float
        The numeric value; 0.0 for an empty string.
    """
    whole = 0.
    fraction = 0.
    # -1 until a ',' has been seen; afterwards it counts the digits read
    # since the most recent ','.
    decimals = -1

    for ch in cost:
        if ch == ',':
            decimals = 0
            continue
        if ch == '.':
            continue
        digit = int(ch)
        if decimals < 0:
            whole = whole * 10 + digit
        else:
            fraction = fraction * 10 + digit
            decimals += 1

    # With no ',' the fraction is still 0., so dividing by 10 ** -1 adds nothing.
    return whole + fraction / (10 ** decimals)

#check whether the cell contains a cost number
def containCost(cell):
    """Tell whether ``cell`` contains at least one formatted cost.

    A cost is 1-3 digits followed by one or more '.'-separated groups of
    exactly three digits, optionally followed by ',' and decimals
    (for example '22.550.000' or '1.234,56').

    Parameters
    ----------
    cell : str
        Cell content. Any non-string value (None, NaN, numbers) is treated
        as containing no cost instead of raising ``TypeError``.

    Returns
    -------
    bool
        True when a cost pattern occurs anywhere in ``cell``.
    """
    # Raw string: '\d' inside a normal literal is an invalid escape sequence
    # and triggers a SyntaxWarning on recent Python versions.
    cost_regex = r'\d{1,3}([.]\d{3})+([,]\d+)?'
    if not isinstance(cell, str):
        return False
    return re.search(cost_regex, cell) is not None

#get all costs in the cell
def getCosts(cell):
    """Return every formatted cost found in ``cell``, in order of appearance.

    A cost is 1-3 digits followed by one or more '.'-separated groups of
    exactly three digits, optionally followed by ',' and decimals
    (for example '22.550.000' or '1.234,56').

    Parameters
    ----------
    cell : str
        Text to scan.

    Returns
    -------
    list of str
        The matched substrings, left to right and non-overlapping; an empty
        list when nothing matches.
    """
    # Raw string: '\d' inside a normal literal is an invalid escape sequence.
    cost_regex = r'\d{1,3}([.]\d{3})+([,]\d+)?'
    # finditer resumes right after each match, like the manual
    # search-and-slice loop it replaces, but without copying the remainder of
    # the string every time. findall is unsuitable here because the pattern
    # has capture groups and would return tuples of groups.
    return [found.group(0) for found in re.finditer(cost_regex, cell)]

#get all costs in the tables
def getAllCosts(tables):
    """Collect every cost string found in the cells of ``tables``.

    Tables are visited in index order and each table's ``.df`` is scanned
    row by row, left to right, so the returned list follows reading order.

    Parameters
    ----------
    tables : indexable sequence
        Items exposing a ``.df`` attribute with ``.shape`` and ``.iloc``.

    Returns
    -------
    list of str
        All matches reported by ``getCosts`` for the cells that
        ``containCost`` accepts.
    """
    collected = []

    for table_idx in range(len(tables)):
        frame = tables[table_idx].df
        row_count, col_count = frame.shape
        for row_idx in range(row_count):
            cells = frame.iloc[row_idx, :].values
            collected += [
                cost
                for col_idx in range(col_count)
                if containCost(cells[col_idx])
                for cost in getCosts(cells[col_idx])
            ]
    return collected

#get totalWithoutVAT, totalVAT, and totalWithVAT
def getFinalCosts(tables):
    """Pick the three invoice totals out of the costs found in ``tables``.

    The last three costs returned by ``getAllCosts`` are taken, in order, as
    the total without VAT, the VAT amount and the total with VAT.

    When the last two costs are identical, a VAT amount of '0' is inserted
    between them. This step now runs before the length check, so a result
    holding only two equal costs yields (x, '0', x) rather than blank values;
    previously the length guard came first and made that case unreachable.

    Parameters
    ----------
    tables : indexable sequence
        Passed unchanged to ``getAllCosts``.

    Returns
    -------
    dict
        Keys 'totalWithoutVAT', 'totalVAT' and 'totalWithVAT' mapped to cost
        strings, or to '' when fewer than three costs are available.
    """
    finalCosts = {'totalWithoutVAT': '', 'totalVAT': '', 'totalWithVAT': ''}
    allCosts = getAllCosts(tables)

    # Identical trailing costs: [..., x, x] becomes [..., x, '0', x].
    if len(allCosts) > 1 and allCosts[-1] == allCosts[-2]:
        allCosts.append(allCosts[-1])
        allCosts[-2] = '0'

    if len(allCosts) < 3:
        return finalCosts

    # Assign by name rather than relying on dict iteration order.
    without_vat, vat, with_vat = allCosts[-3:]
    finalCosts['totalWithoutVAT'] = without_vat
    finalCosts['totalVAT'] = vat
    finalCosts['totalWithVAT'] = with_vat
    return finalCosts

In [33]:
################ MAIN PROCESS #####################

# get information from table
def getPartiesInfo(tables):
    """Scan table cells for seller information.

    Cells are visited table by table, row by row, left to right. While no
    seller has been found, each cell is tested with ``containSellerInfo`` and,
    on success, parsed with ``getSellerInfo``.

    NOTE(review): ``buyer`` is never assigned anywhere in this function, so
    the early return below cannot fire and the second element of the result
    is always an empty dict - presumably buyer extraction is still to be
    written; confirm.

    Parameters
    ----------
    tables : indexable sequence
        Items exposing a ``.df`` attribute with ``.shape`` and ``.iloc``.

    Returns
    -------
    tuple
        ``(seller, buyer)``.
    """
    seller, buyer = {}, {}

    for table_idx in range(len(tables)):
        frame = tables[table_idx].df
        row_count, col_count = frame.shape
        for row_idx in range(row_count):
            cells = frame.iloc[row_idx, :].values
            for col_idx in range(col_count):
                cell = cells[col_idx]
                if len(seller) == 0:
                    # Still looking for the seller; nothing else to check yet.
                    if containSellerInfo(cell):
                        seller = getSellerInfo(cell)
                    continue
                if len(buyer) > 0:
                    # Both parties known: stop scanning.
                    return seller, buyer

    return seller, buyer

# get information from all tables                
def extract_from_pdf(inputfile):
    """Read every page of a PDF with camelot and return its invoice totals.

    Parameters
    ----------
    inputfile : str
        Path of the PDF handed to ``camelot.read_pdf``.

    Returns
    -------
    dict
        A new dict holding the entries produced by ``getFinalCosts``.
    """
    tables = camelot.read_pdf(
        inputfile,
        pages="1-end",
        flavor='lattice',
        process_background=True,
    )
    # Copy into a fresh dict so the caller does not share getFinalCosts' object.
    return dict(getFinalCosts(tables))

In [34]:
# Run the extraction on one sample PDF and print the resulting totals dict.
inputfile = 'Data/61.pdf'
print(extract_from_pdf(inputfile))

{'totalWithoutVAT': '22.550.000', 'totalVAT': '0', 'totalWithVAT': '22.550.000'}


In [9]:
# Read the same PDF again, with the same camelot options as extract_from_pdf,
# to inspect the raw table list.
# NOTE(review): relies on `inputfile` from another cell; the execution counts
# are out of order, so confirm this survives Restart & Run All.
tables = camelot.read_pdf(inputfile, pages="1-end", flavor='lattice', process_background=True)
print(tables)

<TableList n=1>


In [10]:
# Print the DataFrame of the first detected table.
print(tables[0].df)

   0                                                  1  \
0                                                         
1                                                         
2                                                         
3                                                         
4                                                         
5                                                         
6                                                         
7     STT\nTên hàng hóa, dịch vụ\nĐơn vị tính\nSố lư...   
8                                                         
9                                                         
10                                                        
11                                                        
12                                                        

                                                    2 3 4  
0                                                          
1                                                    

In [15]:
# Pass the cell at row 7, column 1 of the first table to getVat and print the result.
# NOTE(review): getVat is not defined in any cell shown here - confirm it still
# exists, otherwise this cell fails on a fresh kernel.
print(getVat(tables[0].df.values[7][1]))

['0']
