## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import cv2
import pytesseract
from glob import glob
import spacy
import re
import string

### Clean Text

In [2]:
def cleanText(txt):
    whitespace = string.whitespace
    punctuation = "!#$%&\'()*+:;<=>?[\\]^`{|}~"
    tableWhitespace = str.maketrans('', '', whitespace)
    tablePunctuation = str.maketrans('','', punctuation)
    text = str(txt)
    text = text.lower()
    removewhitespace = text.translate(tableWhitespace)
    removepunctuation = removewhitespace.translate(tablePunctuation)
    return str(removepunctuation)

## Load NER Model

In [3]:
model_ner = spacy.load('E:/FILE of Trong/NLP Project/ner/outputs/model-best')

### Load Image

In [4]:
image = cv2.imread('E:/FILE of Trong/NLP Project/ner/datasets/6.jpg')
# cv2.imshow("BussinessCard", image)
# cv2.waitKey(0)
# cv2.destroyAllWindows()

### Extract text using Pytesseract

In [10]:
tessData = pytesseract.image_to_data(image)
#Convert to DataFrame
tessList = list(map(lambda x: x.split("\t"), tessData.split('\n')))
df = pd.DataFrame(tessList[1:], columns=tessList[0])
df.dropna(inplace=True)
df['text'] = df['text'].apply(cleanText)

#Convert Data into content
df_clean = df.query('text != ""')
content = " ".join([w for w in df_clean['text']])
print(content)

cell 8099948528 te 8466045457 email lictsrikant@gmail.com life insurance corporation of india seosrika ntht@gmail .com thathineni srikanth insurance advisor agent code no. 0316164y life insurance corporation of india br. off. lic office, trimulgherry, sec’bad - 500 016. add. borabanda, hyderabad - 500 018. lictsrikant8099948528.blogspot.in, interviewsinhyderabad.blogspot.in facebook.com/lictsrikant8099948528, facebook.com/thathineni.srikanth.9 promote your business online pybo


### Get Predictions from the Model

In [12]:
from spacy import displacy
doc = model_ner(content)
displacy.serve(doc, style="ent")




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


### Taging

In [13]:
docjson = doc.to_json()
docjson.keys()

dict_keys(['text', 'ents', 'tokens'])

In [14]:
doc_text = docjson['text']

In [20]:
doc_text

'cell 8099948528 te 8466045457 email lictsrikant@gmail.com life insurance corporation of india seosrika ntht@gmail .com thathineni srikanth insurance advisor agent code no. 0316164y life insurance corporation of india br. off. lic office, trimulgherry, sec’bad - 500 016. add. borabanda, hyderabad - 500 018. lictsrikant8099948528.blogspot.in, interviewsinhyderabad.blogspot.in facebook.com/lictsrikant8099948528, facebook.com/thathineni.srikanth.9 promote your business online pybo'

In [17]:
dataframe_tokens = pd.DataFrame(docjson['tokens'])
dataframe_tokens['token'] = dataframe_tokens[['start', 'end']].apply(lambda x: docjson['text'][x[0]:x[1]], axis=1)
dataframe_tokens.head(10)

  dataframe_tokens['token'] = dataframe_tokens[['start', 'end']].apply(lambda x: docjson['text'][x[0]:x[1]], axis=1)


Unnamed: 0,id,start,end,token
0,0,0,4,cell
1,1,5,15,8099948528
2,2,16,18,te
3,3,19,29,8466045457
4,4,30,35,email
5,5,36,57,lictsrikant@gmail.com
6,6,58,62,life
7,7,63,72,insurance
8,8,73,84,corporation
9,9,85,87,of


In [22]:
right_table = pd.DataFrame(docjson['ents'])[['start', 'label']]
dataframe_tokens = pd.merge(dataframe_tokens, right_table, how='left', on='start')

In [23]:
dataframe_tokens.fillna(value='O', inplace=True)
dataframe_tokens.head(10)

Unnamed: 0,id,start,end,token,label
0,0,0,4,cell,O
1,1,5,15,8099948528,B-PHONE
2,2,16,18,te,O
3,3,19,29,8466045457,B-PHONE
4,4,30,35,email,O
5,5,36,57,lictsrikant@gmail.com,B-EMAIL
6,6,58,62,life,B-ORG
7,7,63,72,insurance,I-ORG
8,8,73,84,corporation,I-ORG
9,9,85,87,of,I-ORG


### Join label to df_clean dataframe

In [33]:
df_clean['end'] = df_clean['text'].apply(lambda x: len(x) + 1).cumsum() - 1
df_clean['start'] = df_clean[['text', 'end']].apply(lambda x: x[1] - len(x[0]), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['end'] = df_clean['text'].apply(lambda x: len(x) + 1).cumsum() - 1
  df_clean['start'] = df_clean[['text', 'end']].apply(lambda x: x[1] - len(x[0]), axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['start'] = df_clean[['text', 'end']].apply(lambda x: x[1] - len(x[0]), axis = 1)


In [46]:
dataframe_info = pd.merge(df_clean, dataframe_tokens[['start', 'end','token','label']], how='inner', on ='start')

In [47]:
dataframe_info.head(10)

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end_x,start,end_y,token,label
0,5,1,3,1,1,1,722,53,64,28,90,cell,4,0,4,cell,O
1,5,1,3,1,1,3,822,53,203,28,96,8099948528,15,5,15,8099948528,B-PHONE
2,5,1,3,2,1,1,55,55,85,89,30,te,18,16,18,te,O
3,5,1,3,2,1,2,822,71,203,65,96,8466045457,29,19,29,8466045457,B-PHONE
4,5,1,3,2,2,1,593,136,93,25,89,email,35,30,35,email,O
5,5,1,3,2,2,3,709,136,316,31,88,lictsrikant@gmail.com,57,36,57,lictsrikant@gmail.com,B-EMAIL
6,5,1,3,3,1,1,46,170,33,14,96,life,62,58,62,life,B-ORG
7,5,1,3,3,1,2,85,151,91,42,94,insurance,72,63,72,insurance,I-ORG
8,5,1,3,3,1,3,183,170,117,14,96,corporation,84,73,84,corporation,I-ORG
9,5,1,3,3,1,4,306,170,20,14,96,of,87,85,87,of,I-ORG


### Bounding Box

In [48]:
bb_df = dataframe_info.query("label != 'O'")
img = image.copy()
for x, y, w, h, label in bb_df[['left', 'top', 'width', 'height', 'label']].values:
    x = int(x)
    y = int(y)
    w = int(w)
    h = int(h)
    cv2.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), 2)
    cv2.putText(img, str(label),(x,y), cv2.FONT_HERSHEY_PLAIN, 1, (255, 0, 255), 2)

cv2.imshow('image', img)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [49]:
bb_df['label'] = bb_df['label'].apply(lambda x: x[2:])
bb_df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bb_df['label'] = bb_df['label'].apply(lambda x: x[2:])


Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end_x,start,end_y,token,label
1,5,1,3,1,1,3,822,53,203,28,96,8099948528,15,5,15,8099948528,PHONE
3,5,1,3,2,1,2,822,71,203,65,96,8466045457,29,19,29,8466045457,PHONE
5,5,1,3,2,2,3,709,136,316,31,88,lictsrikant@gmail.com,57,36,57,lictsrikant@gmail.com,EMAIL
6,5,1,3,3,1,1,46,170,33,14,96,life,62,58,62,life,ORG
7,5,1,3,3,1,2,85,151,91,42,94,insurance,72,63,72,insurance,ORG
8,5,1,3,3,1,3,183,170,117,14,96,corporation,84,73,84,corporation,ORG
9,5,1,3,3,1,4,306,170,20,14,96,of,87,85,87,of,ORG
10,5,1,3,3,1,5,332,170,42,14,96,india,93,88,93,india,ORG
11,5,1,3,3,1,6,668,166,119,37,92,seosrika,102,94,102,seosrika,EMAIL
12,5,1,3,3,1,7,791,165,162,44,91,ntht@gmail,113,103,113,ntht@gmail,EMAIL


In [50]:
class Groupgen():
    def __init__(self):
        self.id = 0
        self.text = ''

    def getgroup(self, text):
        if self.text == text:
            return self.id
        else:
            self.id += 1
            self.text = text
            return self.id

grp_gen = Groupgen()

In [51]:
bb_df['group'] = bb_df['label'].apply(grp_gen.getgroup)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bb_df['group'] = bb_df['label'].apply(grp_gen.getgroup)


In [52]:
bb_df.head(10)

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end_x,start,end_y,token,label,group
1,5,1,3,1,1,3,822,53,203,28,96,8099948528,15,5,15,8099948528,PHONE,1
3,5,1,3,2,1,2,822,71,203,65,96,8466045457,29,19,29,8466045457,PHONE,1
5,5,1,3,2,2,3,709,136,316,31,88,lictsrikant@gmail.com,57,36,57,lictsrikant@gmail.com,EMAIL,2
6,5,1,3,3,1,1,46,170,33,14,96,life,62,58,62,life,ORG,3
7,5,1,3,3,1,2,85,151,91,42,94,insurance,72,63,72,insurance,ORG,3
8,5,1,3,3,1,3,183,170,117,14,96,corporation,84,73,84,corporation,ORG,3
9,5,1,3,3,1,4,306,170,20,14,96,of,87,85,87,of,ORG,3
10,5,1,3,3,1,5,332,170,42,14,96,india,93,88,93,india,ORG,3
11,5,1,3,3,1,6,668,166,119,37,92,seosrika,102,94,102,seosrika,EMAIL,4
12,5,1,3,3,1,7,791,165,162,44,91,ntht@gmail,113,103,113,ntht@gmail,EMAIL,4


### Right and Bottom of bounding box

In [53]:
bb_df[['left', 'top', 'width', 'height']] = bb_df[['left', 'top', 'width', 'height']].astype(int)
bb_df['right'] = bb_df['left'] + bb_df['width']
bb_df['bottom'] = bb_df['top'] + bb_df['height']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bb_df[['left', 'top', 'width', 'height']] = bb_df[['left', 'top', 'width', 'height']].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bb_df['right'] = bb_df['left'] + bb_df['width']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bb_df['bottom'] = bb_df['top'] + bb_df['height']


### Tagging: Groupby group

In [54]:
bb_df.head()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end_x,start,end_y,token,label,group,right,bottom
1,5,1,3,1,1,3,822,53,203,28,96,8099948528,15,5,15,8099948528,PHONE,1,1025,81
3,5,1,3,2,1,2,822,71,203,65,96,8466045457,29,19,29,8466045457,PHONE,1,1025,136
5,5,1,3,2,2,3,709,136,316,31,88,lictsrikant@gmail.com,57,36,57,lictsrikant@gmail.com,EMAIL,2,1025,167
6,5,1,3,3,1,1,46,170,33,14,96,life,62,58,62,life,ORG,3,79,184
7,5,1,3,3,1,2,85,151,91,42,94,insurance,72,63,72,insurance,ORG,3,176,193


In [55]:
col_group = ['left', 'top', 'right', 'bottom', 'label', 'token', 'group']
group_tag_img = bb_df[col_group].groupby(by = 'group')

In [57]:
img_tagging = group_tag_img.agg({
    'left': min,
    'right':max,
    'top': min,
    'bottom':max,
    'label':np.unique,
    'token': lambda x: ' '.join(x)
})

  img_tagging = group_tag_img.agg({
  img_tagging = group_tag_img.agg({


In [58]:
img_tagging

Unnamed: 0_level_0,left,right,top,bottom,label,token
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,822,1025,53,136,[PHONE],8099948528 8466045457
2,709,1025,136,167,[EMAIL],lictsrikant@gmail.com
3,46,374,151,193,[ORG],life insurance corporation of india
4,668,1024,165,209,[EMAIL],seosrika ntht@gmail .com
5,310,755,227,259,[NAME],thathineni srikanth
6,399,669,271,296,[DES],insurance advisor
7,47,882,395,427,[ORG],life insurance corporation of india
8,46,476,506,533,[EMAIL],lictsrikant8099948528.blogspot.in


In [75]:
img_bb = image.copy()
for l, r, t, b, label, token in img_tagging.values:
    cv2.rectangle(img_bb, (l, t), (r, b), (0, 255, 0), 2)
    cv2.putText(img_bb, str(label), (l,t), cv2.FONT_HERSHEY_PLAIN, 1, (255,0,255), 2)

cv2.imshow('img_bb', img_bb)
cv2.waitKey(0)
cv2.destroyAllWindows()

### Parser

In [62]:
def parser(text, label):
    if label == 'PHONE':
        text = text.lower()
        text = re.sub(r'\D','', text)
    elif label == 'EMAIL':
        text = text.lower()
        allow_special_char = '@_.\-'
        text = re.sub(r'[^A-Za-z0-9{} ]'.format(allow_special_char),'', text)
    elif label == 'WEB':
        text = text.lower()
        allow_special_char = ':/.%#\-'
        text = re.sub(r'[^A-Za-z0-9{} ]'.format(allow_special_char),'', text)
    elif label in ('NAME', 'DES'):
        text = text.lower()
        text = re.sub(r'[^a-z ]','', text)
        text = text.title()
    elif label == 'ORG':
        text = text.lower()
        text = re.sub(r'[^a-z0-9 ]','', text)
        text = text.title()
    return text

In [65]:
parser('HOANGCONGTRONG02.DUT@#4$GMAIL.COM', 'EMAIL')

'hoangcongtrong02.dut@4gmail.com'

### Entities    

In [80]:
info_array = dataframe_info[['token', 'label']].values
entities = dict(NAME = [], ORG = [], DES = [],PHONE = [], EMAIL = [], WEB = [])
previous = 'O'

for token, label in info_array:
    bio_tag = label[0]
    label_tag = label[2:]
    # print(bio_tag)
    # print(label_tag)
    # parser the token
    text = parser(token, label_tag)

    if bio_tag in ('B', 'I'):
        if previous != label_tag:
            entities[label_tag].append(text)
        else:
            if bio_tag == 'B':
                entities[label_tag].append(text)
            else:
                if label_tag in ("NAME", "ORG", "DES"):
                    entities[label_tag][-1] = entities[label_tag][-1] + ' ' + text
                else:
                    entities[label_tag][-1] = entities[label_tag][-1] + text
    previous = label_tag


In [81]:
entities

{'NAME': ['Thathineni Srikanth'],
 'ORG': ['Life Insurance Corporation Of India',
  'Life Insurance Corporation Of India'],
 'DES': ['Insurance Advisor'],
 'PHONE': ['8099948528', '8466045457'],
 'EMAIL': ['lictsrikant@gmail.com',
  'seosrikantht@gmail.com',
  'lictsrikant8099948528.blogspot.in'],
 'WEB': []}