In [1]:
import numpy as np
import pandas as pd
import cv2
import pytesseract
from glob import glob
import spacy
import re
import string

In [2]:
def cleanText(txt):
    whitespace = string.whitespace
    punctuation = "!#$%&\'()*+:;<=>?[\\]^`{|}~"
    tableWhitespace = str.maketrans('','',whitespace)
    tablePunctuation = str.maketrans('','',punctuation)
    text = str(txt)
    text = text.lower()
    removewhitespace = text.translate(tableWhitespace)
    removepunctuation = removewhitespace.translate(tablePunctuation)
    
    return str(removepunctuation)

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
model_ner = spacy.load('./output/model-best/')

In [6]:
image = cv2.imread('./data/000.jpg')

tessData = pytesseract.image_to_data(image)

tessList = list(map(lambda x:x.split('\t'), tessData.split('\n')))
df = pd.DataFrame(tessList[1:],columns=tessList[0])
df.dropna(inplace=True) 
df['text'] = df['text'].apply(cleanText)


df_clean = df.query('text != "" ')
content = " ".join([w for w in df_clean['text']])
print(content)

doc = model_ner(content)

corporate packaging promotions bill of lading to consignee — sams dist.ctr 6493 street 1000 s. cucamonga ave bill of lading number 8175105 po bill date from ar50 corporate packaging shipper destination city/state/zip ontario, ca 91761 third party billing, send to name arizona beverages llc c ompany arizona beverages llc street city/state/zip street 1555 s. archibald avenue origin city/state/zip ontario, ca 91761 special instructions —t shippers instructions no. shipping cases no. shipping pallets description of articles special marks exceptions .-y. unni equip d 317 7 equip arivat 12/20/19 0459 carrier bawb ° seal 3289785 remit c.0.d. address reseal door/zone del date appointment 12/20/19 0500 have read and understan appointment / drop rules note w here the rate is depende on value, shippers are required to state specifically in writing the agre or declared value of the property. 7 agreed or declared value of the driver signature property is hereby specifically state by the shipper to 

In [7]:
from spacy import displacy

In [8]:
displacy.serve(doc,style='ent')


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [9]:
displacy.render(doc,style='ent')

In [10]:
docjson = doc.to_json()
docjson.keys()

dict_keys(['text', 'ents', 'tokens'])

In [11]:
doc_text = docjson['text']

In [12]:
datafram_tokens = pd.DataFrame(docjson['tokens'])
datafram_tokens['token'] = datafram_tokens[['start','end']].apply(
    lambda x:doc_text[x[0]:x[1]] , axis = 1)
datafram_tokens.head(10)

Unnamed: 0,id,start,end,token
0,0,0,9,corporate
1,1,10,19,packaging
2,2,20,30,promotions
3,3,31,35,bill
4,4,36,38,of
5,5,39,45,lading
6,6,46,48,to
7,7,49,58,consignee
8,8,59,60,—
9,9,61,65,sams


In [13]:
right_table = pd.DataFrame(docjson['ents'])[['start','label']]
datafram_tokens = pd.merge(datafram_tokens,right_table,how='left',on='start')

In [14]:
datafram_tokens.fillna('O',inplace=True)
datafram_tokens.head(10)

Unnamed: 0,id,start,end,token,label
0,0,0,9,corporate,O
1,1,10,19,packaging,O
2,2,20,30,promotions,O
3,3,31,35,bill,O
4,4,36,38,of,O
5,5,39,45,lading,O
6,6,46,48,to,O
7,7,49,58,consignee,O
8,8,59,60,—,O
9,9,61,65,sams,B-CUSTOMER_NAME


In [15]:
df_clean['end'] = df_clean['text'].apply(lambda x: len(x)+1).cumsum() - 1 
df_clean['start'] = df_clean[['text','end']].apply(lambda x: x[1] - len(x[0]),axis=1)

In [16]:
dataframe_info = pd.merge(df_clean,datafram_tokens[['start','token','label']],how='inner',on='start')

In [17]:
dataframe_info.tail(10)

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
275,5,1,52,1,6,5,1358,1766,17,17,96,of,1688,1686,of,O
276,5,1,52,1,6,6,1382,1767,63,23,91,lading,1695,1689,lading,O
277,5,1,52,1,6,7,1454,1771,54,18,95,terms,1701,1696,terms,O
278,5,1,52,1,7,1,1226,1787,21,16,96,ith,1705,1702,ith,O
279,5,1,52,1,7,2,1257,1789,19,15,90,all,1709,1706,all,O
280,5,1,52,1,7,3,1285,1789,28,17,96,the,1713,1710,the,O
281,5,1,52,1,7,4,1323,1790,25,17,96,bill,1718,1714,bill,O
282,5,1,52,1,7,5,1357,1792,16,16,96,of,1721,1719,of,O
283,5,1,52,1,7,6,1381,1793,63,23,92,lading,1728,1722,lading,O
284,5,1,52,1,7,7,1454,1797,54,17,96,terms,1734,1729,terms,O


In [18]:
bb_df = dataframe_info.query("label != 'O' ")
img = image.copy()

for x,y,w,h,label in bb_df[['left','top','width','height','label']].values:
    x = int(x)
    y = int(y)
    w = int(w)
    h = int(h)
    
    cv2.rectangle(img,(x,y),(x+w,y+h),(0,255,0),2)
    cv2.putText(img,str(label),(x,y),cv2.FONT_HERSHEY_PLAIN,1,(255,0,255),2)
    
    
cv2.imshow('Predictions',img)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [19]:
bb_df['label'] = bb_df['label'].apply(lambda x: x[2:])
bb_df.head()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
9,5,1,3,1,2,3,280,439,77,21,93,sams,65,61,sams,CUSTOMER_NAME
10,5,1,3,1,2,4,367,440,117,21,91,dist.ctr,74,66,dist.ctr,CUSTOMER_NAME
11,5,1,3,1,2,5,493,441,71,20,96,6493,79,75,6493,CUSTOMER_NAME
12,5,1,4,1,1,1,92,471,82,37,97,street,86,80,street,CUSTOMER_ADDRESS
13,5,1,4,1,1,2,300,480,53,21,93,1000,91,87,1000,CUSTOMER_ADDRESS


In [20]:
class groupgen():
    def __init__(self):
        self.id = 0
        self.text = ''
        
    def getgroup(self,text):
        if self.text == text:
            return self.id
        else:
            self.id +=1
            self.text = text
            return self.id
        
grp_gen = groupgen()

In [21]:
bb_df['group'] = bb_df['label'].apply(grp_gen.getgroup)

In [22]:
bb_df[['left','top','width','height']] = bb_df[['left','top','width','height']].astype(int)
bb_df['right'] = bb_df['left'] + bb_df['width']
bb_df['bottom'] = bb_df['top'] + bb_df['height']

In [23]:
col_group = ['left','top','right','bottom','label','token','group']
group_tag_img = bb_df[col_group].groupby(by='group')

In [24]:
img_tagging = group_tag_img.agg({
    
    'left':min,
    'right':max,
    'top':min,
    'bottom':max,
    'label':np.unique,
    'token':lambda x: " ".join(x)
    
})

In [25]:
img_tagging

Unnamed: 0_level_0,left,right,top,bottom,label,token
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,280,564,439,461,[CUSTOMER_NAME],sams dist.ctr 6493
2,92,626,471,508,[CUSTOMER_ADDRESS],street 1000 s. cucamonga ave
3,846,1366,315,346,[INVOICE_NO],bill of lading number 8175105
4,997,1417,448,476,[COMPANY_NAME],ar50 corporate packaging
5,425,536,561,584,[CITY],ca 91761
6,825,1409,482,519,[COMPANY_ADDRESS],street 1555 s. archibald avenue
7,840,1293,569,599,[CITY],city ca 91761


In [26]:
img_bb = image.copy()
for l,r,t,b,label,token in img_tagging.values:
    cv2.rectangle(img_bb,(l,t),(r,b),(0,255,0),2)
    cv2.putText(img_bb,str(label),(l,t),cv2.FONT_HERSHEY_PLAIN,1,(255,0,255),2)
cv2.imshow('',img_bb)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [41]:
def parser(text,label):
    if label=='CUSTOMER_NAME':
        text = text.lower()
        allow_special_char='.#_'
        text = re.sub(r'[^A-Za-z0-9{}]'.format(allow_special_char),'',text)
        
    elif label=='CUSTOMER_ADDRESS':
        text = text.lower()
        allow_special_char='@.,/\-_'
        text = re.sub(r'[^A-Za-z0-9{}]'.format(allow_special_char),'',text)
        
    elif label=='COMPANY_ADDRESS':
        text = text.lower()
        allow_special_char='@.,/\-_'
        text = re.sub(r'[^A-Za-z0-9{}]'.format(allow_special_char),'',text)
        
    elif label=='INVOICE_NO':
        text = text.lower()
        text = re.sub(r'[\D]','',text)
        
    elif label=='CITY':
        text = text.lower()
        allow_special_char='.#_'
        text = re.sub(r'[^A-Za-z0-9{}]'.format(allow_special_char),'',text)
        
    elif label=='COMPANY_NAME':
        text = text.lower()
        allow_special_char='.#_'
        text = re.sub(r'[^A-Za-z0-9{}]'.format(allow_special_char),'',text)
        
    return text
        

In [42]:
parser('bill of lading number 8175105','INVOICE_NO')

'8175105'

In [43]:
dataframe_info[['token','label']]

Unnamed: 0,token,label
0,corporate,O
1,packaging,O
2,promotions,O
3,bill,O
4,of,O
...,...,...
280,the,O
281,bill,O
282,of,O
283,lading,O


In [47]:
info_array = dataframe_info[['token','label']].values
entities = dict(INVOICE_NO=[],CUSTOMER_NAME=[],CUSTOMER_ADDRESS=[],COMPANY_NAME=[],COMPANY_ADDRESS=[],CITY=[])
previous = 'O'

for token,label in info_array:
    #print(token,label)
    bio_tag = label[0]
    label_tag = label[2:]
    text = parser(token,label_tag)
    if bio_tag in ('B','I'):
        if previous != label_tag:
            entities[label_tag].append(text)
        else:
            if bio_tag=="B":
                entities[label_tag].append(text)
            else:
                if label_tag in ('INVOICE_NO','CUSTOMER_NAME','CUSTOMER_ADDRESS','COMPANY_NAME','COMPANY_ADDRESS','CITY'):
                    entities[label_tag][-1] = entities[label_tag][-1]+' '+text
                else:
                    entities[label_tag][-1] = entities[label_tag][-1]+text
                    
    previous = label_tag
    

In [46]:
entities

{'INVOICE_NO': ['    8175105'],
 'CUSTOMER_NAME': ['sams dist.ctr 6493'],
 'CUSTOMER_ADDRESS': ['street 1000 s. cucamonga ave'],
 'COMPANY_NAME': ['ar50 corporate packaging'],
 'COMPANY_ADDRESS': ['street 1555 s. archibald avenue'],
 'CITY': ['ca 91761', 'city', 'ca 91761']}