In [18]:
import numpy as np
import pandas as pd
import cv2
import pytesseract
from glob import glob
import spacy
from spacy import displacy
import re
import string

# Cleansing

In [2]:
# help function to clean up you text
def cleanText(text: str) -> str:
    """
    Designed for:
    -------------
    removement of  whitespaces and special characters from source text
    Input:
    ------
    str to preprocess
    Output:
    -------
    clean str
    """
    whitespace = string.whitespace
    punctuation = '!#$%&\'()*+/:;<=>?[\\]^`{|}~'
    table_whitespace = str.maketrans('', '', whitespace)
    table_puctuation = str.maketrans('', '', punctuation)
    text = str(text)
    text = text.lower()
    removeWhitespace = text.translate(table_whitespace)
    removePunctuation = removeWhitespace.translate(table_puctuation)
    return str(removePunctuation)

In [3]:
# load NER model 
modelNER = spacy.load('./output/model-best/')

In [20]:
# load image
img = cv2.imread('./data/6.jpg')
#cv2.imshow(img)
#cv2.waitKey(1)
#cv2.destroyAllWindows()

# extract data with pytesseract
rawData = pytesseract.image_to_data(img)
# convert to a pd dataframe
rawList = list(map(lambda x: x.split('\t'), rawData.split('\n')))
df = pd.DataFrame(rawList[1:], columns=rawList[0])
df.dropna(inplace=True)
df['text'] = df['text'].apply(cleanText)
df_clean = df.query('text != "" ')
content = " ".join([word for word in df_clean['text']])

# get predictions
doc = modelNER(content)
displacy.serve(doc, style="ent", auto_select_port=True)





Using the 'ent' visualizer
Serving on http://0.0.0.0:5001 ...



127.0.0.1 - - [05/Dec/2023 22:18:43] "GET / HTTP/1.1" 200 6203
127.0.0.1 - - [05/Dec/2023 22:18:43] "GET /favicon.ico HTTP/1.1" 200 6203


Shutting down server on port 5001.


In [22]:
docjson = doc.to_json()
docjson.keys()

dict_keys(['text', 'ents', 'tokens'])

In [41]:
doc_text = docjson['text']
data_tokens = pd.DataFrame(docjson['tokens'])
data_tokens['text'] = data_tokens[['start', 'end']].apply(
    lambda x: doc_text[x[0]:x[1]], axis=1)

  lambda x: doc_text[x[0]:x[1]], axis=1)


In [42]:
data_tokens.head()

Unnamed: 0,id,start,end,text
0,0,0,4,cell
1,1,5,15,8099948528
2,2,16,18,ga
3,3,19,29,8466045457
4,4,30,35,email


In [46]:
dataframe_tokens = pd.merge(data_tokens, 
              pd.DataFrame(docjson['ents'])[['start', 'label']], 
              how='left', 
              on='start')

In [49]:
dataframe_tokens.fillna('O', inplace=True)

In [53]:
dataframe_tokens.columns = ['id', 'start', 'end', 'token', 'label']

In [55]:
dataframe_tokens.head(10)

Unnamed: 0,id,start,end,token,label
0,0,0,4,cell,O
1,1,5,15,8099948528,B-PHONE
2,2,16,18,ga,O
3,3,19,29,8466045457,O
4,4,30,35,email,O
5,5,36,57,lictsrikant@gmail.com,B-EMAIL
6,6,58,62,life,B-ORG
7,7,63,72,insurance,I-ORG
8,8,73,84,corporation,I-ORG
9,9,85,87,of,I-ORG


In [58]:
df_clean['end'] = df_clean['text'].apply(lambda x: len(x) + 1).cumsum() - 1
df_clean['start'] = df_clean[['text', 'end']].apply(lambda x: x[1] - len(x[0]), axis=1)

df_clean.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['end'] = df_clean['text'].apply(lambda x: len(x) + 1).cumsum() - 1
  df_clean['start'] = df_clean[['text', 'end']].apply(lambda x: x[1] - len(x[0]), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['start'] = df_clean[['text', 'end']].apply(lambda x: x[1] - len(x[0]), axis=1)


Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start
12,5,1,3,1,1,1,722,53,64,28,93.027191,cell,4,0
14,5,1,3,1,1,3,822,53,203,28,96.644051,8099948528,15,5
17,5,1,3,2,1,1,55,55,85,89,47.185547,ga,18,16
18,5,1,3,2,1,2,822,95,203,28,96.643845,8466045457,29,19
20,5,1,3,2,2,1,593,136,93,25,89.28978,email,35,30
22,5,1,3,2,2,3,709,136,316,31,86.249245,lictsrikant@gmail.com,57,36
25,5,1,3,3,1,1,46,170,33,14,96.406654,life,62,58
26,5,1,3,3,1,2,85,151,92,42,95.806709,insurance,72,63
27,5,1,3,3,1,3,183,170,117,14,96.909729,corporation,84,73
28,5,1,3,3,1,4,306,170,20,14,96.172005,of,87,85


In [61]:
data_info = pd.merge(df_clean, 
         dataframe_tokens[['start', 'token', 'label']], 
         how='inner', 
         on='start')

data_info.tail(10)

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
39,5,1,6,3,1,6,495,474,53,21,96.027008,018.,305,301,018,O
40,5,1,6,3,2,1,46,506,430,27,85.618164,"lictsrikant8099948528.blogspot.in,",340,306,lictsrikant8099948528.blogspot.in,I-EMAIL
41,5,1,6,3,2,2,488,506,429,27,89.912384,interviewsinhyderabad.blogspot.in,374,341,interviewsinhyderabad.blogspot.in,O
42,5,1,6,3,3,1,44,539,472,25,87.967415,"facebook.comlictsrikant8099948528,",409,375,facebook.comlictsrikant8099948528,O
43,5,1,6,3,3,2,526,539,443,22,90.545654,facebook.comthathineni.srikanth.9,443,410,facebook.comthathineni.srikanth.9,O
44,5,1,6,3,4,1,46,571,106,21,96.344551,promote,451,444,promote,O
45,5,1,6,3,4,2,161,576,56,22,95.835091,your,456,452,your,O
46,5,1,6,3,4,3,226,571,111,21,96.409111,business,465,457,business,O
47,5,1,6,3,4,4,347,571,74,21,93.162605,online,472,466,online,O
48,5,1,6,3,4,5,432,571,96,27,92.261208,pybo,477,473,pybo,O


# Creating BBOX

In [63]:
bbox = data_info.query("label != 'O' ")
image = img.copy()

for x, y, w, h, label in bbox[['left', 'top', 'width', 'height', 'label']].values:
    x = int(x)
    y = int(y)
    w = int(w)
    h = int(h)
    cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
    cv2.putText(image, str(label), (x, y), cv2.FONT_HERSHEY_PLAIN, 1, (255, 0, 0), 2)

cv2.imshow('Predictions', image)
cv2.waitKey(5)
cv2.destroyAllWindows()

In [64]:
bbox

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
1,5,1,3,1,1,3,822,53,203,28,96.644051,8099948528,15,5,8099948528,B-PHONE
5,5,1,3,2,2,3,709,136,316,31,86.249245,lictsrikant@gmail.com,57,36,lictsrikant@gmail.com,B-EMAIL
6,5,1,3,3,1,1,46,170,33,14,96.406654,life,62,58,life,B-ORG
7,5,1,3,3,1,2,85,151,92,42,95.806709,insurance,72,63,insurance,I-ORG
8,5,1,3,3,1,3,183,170,117,14,96.909729,corporation,84,73,corporation,I-ORG
9,5,1,3,3,1,4,306,170,20,14,96.172005,of,87,85,of,I-ORG
10,5,1,3,3,1,5,332,170,42,14,96.796776,india,93,88,india,I-ORG
11,5,1,3,3,1,6,668,163,357,46,91.289627,seosrikantht@gmail.com,116,94,seosrikantht@gmail.com,B-EMAIL
12,5,1,4,1,1,1,310,228,232,30,92.664276,thathineni,127,117,thathineni,B-NAME
13,5,1,4,1,1,2,557,227,198,32,96.249886,srikanth,136,128,srikanth,I-NAME


In [65]:
bbox['label'] = bbox['label'].apply(lambda x: x[2:])
bbox.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bbox['label'] = bbox['label'].apply(lambda x: x[2:])


Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
1,5,1,3,1,1,3,822,53,203,28,96.644051,8099948528,15,5,8099948528,PHONE
5,5,1,3,2,2,3,709,136,316,31,86.249245,lictsrikant@gmail.com,57,36,lictsrikant@gmail.com,EMAIL
6,5,1,3,3,1,1,46,170,33,14,96.406654,life,62,58,life,ORG
7,5,1,3,3,1,2,85,151,92,42,95.806709,insurance,72,63,insurance,ORG
8,5,1,3,3,1,3,183,170,117,14,96.909729,corporation,84,73,corporation,ORG
9,5,1,3,3,1,4,306,170,20,14,96.172005,of,87,85,of,ORG
10,5,1,3,3,1,5,332,170,42,14,96.796776,india,93,88,india,ORG
11,5,1,3,3,1,6,668,163,357,46,91.289627,seosrikantht@gmail.com,116,94,seosrikantht@gmail.com,EMAIL
12,5,1,4,1,1,1,310,228,232,30,92.664276,thathineni,127,117,thathineni,NAME
13,5,1,4,1,1,2,557,227,198,32,96.249886,srikanth,136,128,srikanth,NAME


In [66]:
# grouping labels

class GroupGen:
    def __init__(self):
        self.id = 0
        self.text = ''

    def getgroup(self, text):
        if self.text == text:
            return self.id
        else:
            self.id += 1
            self.text = text
            return self.id

grp_gen = GroupGen()

In [67]:
bbox['group'] = bbox['label'].apply(grp_gen.getgroup)

bbox

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bbox['group'] = bbox['label'].apply(grp_gen.getgroup)


Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label,group
1,5,1,3,1,1,3,822,53,203,28,96.644051,8099948528,15,5,8099948528,PHONE,1
5,5,1,3,2,2,3,709,136,316,31,86.249245,lictsrikant@gmail.com,57,36,lictsrikant@gmail.com,EMAIL,2
6,5,1,3,3,1,1,46,170,33,14,96.406654,life,62,58,life,ORG,3
7,5,1,3,3,1,2,85,151,92,42,95.806709,insurance,72,63,insurance,ORG,3
8,5,1,3,3,1,3,183,170,117,14,96.909729,corporation,84,73,corporation,ORG,3
9,5,1,3,3,1,4,306,170,20,14,96.172005,of,87,85,of,ORG,3
10,5,1,3,3,1,5,332,170,42,14,96.796776,india,93,88,india,ORG,3
11,5,1,3,3,1,6,668,163,357,46,91.289627,seosrikantht@gmail.com,116,94,seosrikantht@gmail.com,EMAIL,4
12,5,1,4,1,1,1,310,228,232,30,92.664276,thathineni,127,117,thathineni,NAME,5
13,5,1,4,1,1,2,557,227,198,32,96.249886,srikanth,136,128,srikanth,NAME,5


In [70]:
# right and bottom of bbox
bbox[['left', 'top', 'width', 'height']] = bbox[['left', 'top', 'width', 'height']].astype(int)
bbox['right'] = bbox['left'] + bbox['width']
bbox['bottom'] = bbox['top'] + bbox['height']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bbox[['left', 'top', 'width', 'height']] = bbox[['left', 'top', 'width', 'height']].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bbox['right'] = bbox['left'] + bbox['width']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bbox['bottom'] = bbox['top'] + bbox['height']


In [74]:
# tagging : group by groups
columns_group = ['left', 'top', 'right', 'bottom', 'label', 'token', 'group']
group_tag_img = bbox[columns_group].groupby(by='group')
img_tagging = group_tag_img.agg({
    'left': min, 
    'right': max, 
    'top': min, 
    'bottom': max, 
    'label': np.unique, 
    'token': lambda x: " ".join(x)})

img_tagging

  img_tagging = group_tag_img.agg({
  img_tagging = group_tag_img.agg({


Unnamed: 0_level_0,left,right,top,bottom,label,token
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,822,1025,53,81,[PHONE],8099948528
2,709,1025,136,167,[EMAIL],lictsrikant@gmail.com
3,46,374,151,193,[ORG],life insurance corporation of india
4,668,1025,163,209,[EMAIL],seosrikantht@gmail.com
5,310,755,227,259,[NAME],thathineni srikanth
6,399,669,271,296,[DES],insurance advisor
7,47,882,395,427,[ORG],life insurance corporation of india
8,46,476,506,533,[EMAIL],lictsrikant8099948528.blogspot.in


In [78]:
img_bb = img.copy()
for l, r, t, b, label, token in img_tagging.values:
    cv2.rectangle(img_bb, (l, t), (r, b), (0, 255, 0), 2)
    cv2.putText(img_bb, str(label), (l, t), cv2.FONT_HERSHEY_PLAIN, 1, (255, 0, 255), 2)

cv2.imshow('BBOX for business card', img_bb)
cv2.waitKeyEx(0)
cv2.destroyAllWindows()

2023-12-07 00:24:27.327 python[3629:191997] IMKClient Stall detected, *please Report* your user scenario attaching a spindump (or sysdiagnose) that captures the problem - (imkxpc_bundleIdentifierWithReply:) block performed very slowly (12.96 secs).


# Parser

In [85]:
def parser(text, label):
    if label == "PHONE":
        text = text.lower()
        text = re.sub(r'\D', '', text)
    elif label == "EMAIL":
        text = text.lower()
        allow_special_char = "@_.\-"
        text = re.sub(r'[^A-Za-z0-9{} ]'.format(allow_special_char), '', text)
    elif label == "WEB":
        text == text.lower()
        allow_special_char = ":/.%#\-"
        text = re.sub(r'[^A-Za-z0-9{} ]'.format(allow_special_char), '', text)
    elif label in ("DES", "NAME"):
        text == text.lower()
        text = re.sub(r'[^A-Za-z ]', '', text)
        text = text.title()
    elif label == "ORG":
        text == text.lower()
        text = re.sub(r'[^a-z0-9 ]', '', text)
        text = text.title()
    return text

In [88]:
parser("Anton_1988@sdsd.ry", "EMAIL")

'anton_1988@sdsd.ry'

# Entities

In [94]:
info_array = data_info[['token', 'label']].values
entities = dict(
    NAME=[], ORG=[], DES=[], PHONE=[], EMAIL=[], WEB=[])
previous = "O"

for token, label in info_array:
    bio_tag = label[0]
    label_tag = label[2:]
    text = parser(token, label_tag)
    if bio_tag in ('B', 'I'):
        if previous != label_tag:
            entities[label_tag].append(text)
        else:
            if bio_tag == "B":
                entities[label_tag].append(text)
            else:
                if label_tag in ("NAME", "ORG", "DES"):
                    entities[label_tag][-1] += " " + text
                else:
                    entities[label_tag][-1] += + text
    previous = label_tag
            
    

In [96]:
entities

{'NAME': ['Thathineni Srikanth'],
 'ORG': ['Life Insurance Corporation Of India',
  'Life Insurance Corporation Of India'],
 'DES': ['Insurance Advisor'],
 'PHONE': ['8099948528'],
 'EMAIL': ['lictsrikant@gmail.com',
  'seosrikantht@gmail.com',
  'lictsrikant8099948528.blogspot.in'],
 'WEB': []}

In [98]:
#jupyter nbconvert --to script 'predict.ipynb'