In [4]:
import numpy as np
import pandas as pd
import cv2
import PIL
import pytesseract
from glob import glob
import spacy
import matplotlib.pyplot as plt
import re
import string

In [7]:

def cleanText(txt):
    whitespace = string.whitespace
    punctuation = "!#$%&'()*+:;<=>?[\\]^`{|}~"
    tableWhitespace = str.maketrans("", "", whitespace)
    tablePunctuation = str.maketrans("", "", punctuation)
    text = str(txt)
    text = text.lower()
    removewhitespace = text.translate(tableWhitespace)
    removepunctuation = removewhitespace.translate(tablePunctuation)
    return str(removepunctuation)


In [19]:
# Load NER model
model_ner=spacy.load('output/model-last')

In [124]:
# Load Image
image = cv2.imread('demo.png')
# To show image
cv2.imshow('demo',image)
cv2.waitKey(0)
cv2.destroyAllWindows()

# Extract data using pytesseract
tessData=pytesseract.image_to_data(image)
# Convert data into dataframe
tessList = list(map(lambda x: x.split('\t'), tessData.split('\n')))
df = pd.DataFrame(tessList[1:], columns=tessList[0])
df.dropna(inplace=True) # drop missing values
df['text']=df['text'].apply(cleanText)

# Convert data into content
df_clean = df.query('text != "" ') # take only clean text that is not empty
content = " ".join([w for w in df_clean['text']])


# Get predictions from NER model

doc = model_ner(content)

In [125]:
# To visualize using spacy displacy
from spacy import displacy

In [126]:
displacy.render(doc, style='ent')

In [127]:
docjson = doc.to_json()
docjson.keys()

dict_keys(['text', 'ents', 'tokens'])

In [128]:
docjson['tokens']

[{'id': 0, 'start': 0, 'end': 1},
 {'id': 1, 'start': 2, 'end': 4},
 {'id': 2, 'start': 4, 'end': 5},
 {'id': 3, 'start': 5, 'end': 15},
 {'id': 4, 'start': 15, 'end': 16},
 {'id': 5, 'start': 17, 'end': 20},
 {'id': 6, 'start': 20, 'end': 21},
 {'id': 7, 'start': 22, 'end': 28},
 {'id': 8, 'start': 29, 'end': 37},
 {'id': 9, 'start': 38, 'end': 41},
 {'id': 10, 'start': 41, 'end': 42},
 {'id': 11, 'start': 43, 'end': 47},
 {'id': 12, 'start': 47, 'end': 48},
 {'id': 13, 'start': 49, 'end': 53},
 {'id': 14, 'start': 53, 'end': 54},
 {'id': 15, 'start': 55, 'end': 56},
 {'id': 16, 'start': 57, 'end': 68},
 {'id': 17, 'start': 69, 'end': 77},
 {'id': 18, 'start': 78, 'end': 81},
 {'id': 19, 'start': 82, 'end': 92},
 {'id': 20, 'start': 93, 'end': 103},
 {'id': 21, 'start': 104, 'end': 105},
 {'id': 22, 'start': 106, 'end': 109},
 {'id': 23, 'start': 110, 'end': 113},
 {'id': 24, 'start': 114, 'end': 116},
 {'id': 25, 'start': 117, 'end': 119},
 {'id': 26, 'start': 120, 'end': 121},
 {'id

In [129]:
doc_text = docjson['text']
dataframe_tokens = pd.DataFrame(docjson['tokens'])
dataframe_tokens.head()

Unnamed: 0,id,start,end
0,0,0,1
1,1,2,4
2,2,4,5
3,3,5,15
4,4,15,16


In [130]:
dataframe_tokens['token'] = dataframe_tokens[['start','end']].apply(lambda x:doc_text[x[0]:x[1]], axis=1)
dataframe_tokens.head(20)

  dataframe_tokens['token'] = dataframe_tokens[['start','end']].apply(lambda x:doc_text[x[0]:x[1]], axis=1)


Unnamed: 0,id,start,end,token
0,0,0,1,©
1,1,2,4,91
2,2,4,5,-
3,3,5,15,8830952270
4,4,15,16,°
5,5,17,20,adv
6,6,20,21,.
7,7,22,28,ahilya
8,8,29,37,nalawade
9,9,38,41,b.a


In [131]:
pd.DataFrame(docjson['ents'])[['start','label']]

Unnamed: 0,start,label
0,0,I-PHONE
1,2,B-PHONE
2,69,B-ORG
3,78,I-ORG
4,260,B-EMAIL


In [132]:
right_table = pd.DataFrame(docjson['ents'])[['start','label']]
right_table

Unnamed: 0,start,label
0,0,I-PHONE
1,2,B-PHONE
2,69,B-ORG
3,78,I-ORG
4,260,B-EMAIL


In [133]:
dataframe_tokens = pd.merge(dataframe_tokens, right_table, how='left', on='start')

In [134]:
dataframe_tokens

Unnamed: 0,id,start,end,token,label
0,0,0,1,©,I-PHONE
1,1,2,4,91,B-PHONE
2,2,4,5,-,
3,3,5,15,8830952270,
4,4,15,16,°,
...,...,...,...,...,...
64,64,241,248,rankala,
65,65,248,249,",",
66,66,250,258,kolhapur,
67,67,258,259,.,


In [135]:
dataframe_tokens.fillna('O', inplace=True)

In [136]:
dataframe_tokens.head(10)

Unnamed: 0,id,start,end,token,label
0,0,0,1,©,I-PHONE
1,1,2,4,91,B-PHONE
2,2,4,5,-,O
3,3,5,15,8830952270,O
4,4,15,16,°,O
5,5,17,20,adv,O
6,6,20,21,.,O
7,7,22,28,ahilya,O
8,8,29,37,nalawade,O
9,9,38,41,b.a,O


For drawing the bounding box we need left, right, top and confidence score of the bounding box info from df_clean


In [137]:
# ASSUMING THAT EACH WORD IS SEPARATED BY ONE SPACE
df_clean.head(5)
df_clean['end'] = df_clean['text'].apply(lambda x: len(x) + 1).cumsum() - 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['end'] = df_clean['text'].apply(lambda x: len(x) + 1).cumsum() - 1


In [138]:
df_clean.head(10)

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end
8,5,1,2,1,1,1,1513,38,95,110,0.862961,©,1
9,5,1,2,1,1,2,1655,55,810,98,74.315201,91-8830952270°,16
13,5,1,3,1,1,1,731,384,261,106,92.558205,adv.,21
14,5,1,3,1,1,2,1037,388,469,104,91.801567,ahilya,28
15,5,1,3,1,1,3,1553,391,739,105,90.126564,nalawade,37
17,5,1,3,1,2,1,1251,573,373,134,6.284714,b.a.,42
18,5,1,3,1,2,2,1667,574,252,86,6.284714,ll.b.,48
19,5,1,3,1,2,3,1956,572,335,114,89.698112,hons.,54
23,5,1,4,1,1,1,376,819,2,3,74.782715,_,56
24,5,1,4,1,1,2,466,823,467,105,96.897415,maharashtra,68


In [139]:
df_clean["start"] = df_clean[["text", "end"]].apply(lambda x: x[1] - len(x[0]), axis=1)

  df_clean["start"] = df_clean[["text", "end"]].apply(lambda x: x[1] - len(x[0]), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["start"] = df_clean[["text", "end"]].apply(lambda x: x[1] - len(x[0]), axis=1)


In [140]:
df_clean.head(10)

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start
8,5,1,2,1,1,1,1513,38,95,110,0.862961,©,1,0
9,5,1,2,1,1,2,1655,55,810,98,74.315201,91-8830952270°,16,2
13,5,1,3,1,1,1,731,384,261,106,92.558205,adv.,21,17
14,5,1,3,1,1,2,1037,388,469,104,91.801567,ahilya,28,22
15,5,1,3,1,1,3,1553,391,739,105,90.126564,nalawade,37,29
17,5,1,3,1,2,1,1251,573,373,134,6.284714,b.a.,42,38
18,5,1,3,1,2,2,1667,574,252,86,6.284714,ll.b.,48,43
19,5,1,3,1,2,3,1956,572,335,114,89.698112,hons.,54,49
23,5,1,4,1,1,1,376,819,2,3,74.782715,_,56,55
24,5,1,4,1,1,2,466,823,467,105,96.897415,maharashtra,68,57


In [141]:
# inner join with start as the common column for join
dataframe_info = pd.merge(df_clean, dataframe_tokens[['start','token','label']], how='inner', on='start')

In [142]:
dataframe_info.tail(10)

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
42,5,1,6,1,1,7,1438,1101,78,62,88.115845,‘c’,211,208,‘,O
43,5,1,6,1,1,8,1544,1103,185,79,96.98378,"wing,",217,212,wing,O
44,5,1,6,1,1,9,1761,1107,230,64,93.305153,behind,224,218,behind,O
45,5,1,6,1,1,10,2020,1110,49,60,93.122993,d,226,225,d,O
46,5,1,6,1,1,11,2096,1111,150,63,96.961861,mart,231,227,mart,O
47,5,1,6,1,1,12,2270,1114,154,70,74.981071,"mall,",237,232,mall,O
48,5,1,6,1,2,1,2,1185,133,64,21.179123,be,240,238,be,O
49,5,1,6,1,2,2,972,1189,282,74,92.60656,"rankala,",249,241,rankala,O
50,5,1,6,1,2,3,1286,1193,310,78,95.528534,kolhapur.,259,250,kolhapur,O
51,5,1,7,1,1,1,732,1295,1276,122,72.984489,adv.ahilyanalawade07@gmail.com,290,260,adv.ahilyanalawade07@gmail.com,B-EMAIL


Bounding Box


In [143]:
bb_df = dataframe_info.query(' label != "O"')
img = image.copy()

for x, y, w, h, label in bb_df[["left", "top", "width", "height", "label"]].values:
    x = int(x)
    y = int(y)
    w = int(w)
    h = int(h)

    cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
    cv2.putText(img, str(label), (x, y), cv2.FONT_HERSHEY_PLAIN, 1, (255, 0, 255), 2)

cv2.imshow("Predictions", img)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [144]:
# Combined bounding box
bb_df['label'] = bb_df['label'].apply(lambda x: x[2:])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bb_df['label'] = bb_df['label'].apply(lambda x: x[2:])


In [145]:
bb_df.head(10)

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
0,5,1,2,1,1,1,1513,38,95,110,0.862961,©,1,0,©,PHONE
1,5,1,2,1,1,2,1655,55,810,98,74.315201,91-8830952270°,16,2,91,PHONE
10,5,1,4,1,1,3,963,827,300,84,96.493385,national,77,69,national,ORG
11,5,1,4,1,1,4,1295,830,147,74,96.818825,law,81,78,law,ORG
51,5,1,7,1,1,1,732,1295,1276,122,72.984489,adv.ahilyanalawade07@gmail.com,290,260,adv.ahilyanalawade07@gmail.com,EMAIL


In [146]:
# Grouping the labels of one type eg: PHONE
class groupgen():
    def __init__(self):
        self.id=0
        self.text=''

    def getgroup(self,text):
        if self.text == text:
            return self.id
        else:
            self.id += 1
            self.text=text
            return self.id

In [147]:
grp_gen = groupgen()

In [148]:
bb_df['group'] = bb_df['label'].apply(grp_gen.getgroup)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bb_df['group'] = bb_df['label'].apply(grp_gen.getgroup)


In [149]:
bb_df.head(10)

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label,group
0,5,1,2,1,1,1,1513,38,95,110,0.862961,©,1,0,©,PHONE,1
1,5,1,2,1,1,2,1655,55,810,98,74.315201,91-8830952270°,16,2,91,PHONE,1
10,5,1,4,1,1,3,963,827,300,84,96.493385,national,77,69,national,ORG,2
11,5,1,4,1,1,4,1295,830,147,74,96.818825,law,81,78,law,ORG,2
51,5,1,7,1,1,1,732,1295,1276,122,72.984489,adv.ahilyanalawade07@gmail.com,290,260,adv.ahilyanalawade07@gmail.com,EMAIL,3


In [150]:
# Creating right and bottom of bounding box
bb_df[['left','width','top','height']] = bb_df[['left','width','top','height']].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bb_df[['left','width','top','height']] = bb_df[['left','width','top','height']].astype(int)


In [151]:
bb_df['right'] = bb_df['left'] + bb_df['width']
bb_df['bottom'] = bb_df['top'] + bb_df['height']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bb_df['right'] = bb_df['left'] + bb_df['width']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bb_df['bottom'] = bb_df['top'] + bb_df['height']


In [152]:
# tagging: groupby group
col_group = ['left','top','right','bottom','label','token','group']
group_tag_img = bb_df[col_group].groupby(by='group')

In [153]:
group_tag_img

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000025E9DF9CC20>

In [154]:
img_tagging = group_tag_img.agg({
    'left':min,
    'right':max,
    'top':min,
    'bottom':max,
    'label':np.unique,
    'token': lambda x: " ".join(x)

})

  img_tagging = group_tag_img.agg({
  img_tagging = group_tag_img.agg({


In [155]:
img_tagging

Unnamed: 0_level_0,left,right,top,bottom,label,token
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1513,2465,38,153,[PHONE],© 91
2,963,1442,827,911,[ORG],national law
3,732,2008,1295,1417,[EMAIL],adv.ahilyanalawade07@gmail.com


In [158]:
img_bb = image.copy()
for l, r, t, b, label, token in img_tagging.values:
    cv2.rectangle(img_bb, (l, t), (r, b), (0, 255, 0), 2)

    cv2.putText(
        img_bb, str(label[0]), (l, t), cv2.FONT_HERSHEY_PLAIN, 2, (255, 0, 255), 2
    )

# Define the window name
window_name = "Bounding Box Business Card"

# Set the window properties
cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)

# Resize the window
cv2.resizeWindow(window_name, 900, 600)  # Adjust the dimensions as needed

# Display the image
cv2.imshow(window_name, img_bb)

# Wait for a key press and close the window
cv2.waitKey(0)
cv2.destroyAllWindows()

Parsing Function


In [117]:
def parser(text, label):
    if label == "PHONE":
        text = text.lower()
        text = re.sub(r"\D", "", text)

    elif label == "EMAIL":
        text = text.lower()
        allow_special_char = r"@-_."
        text = re.sub(r"[^A-Za-z0-9{} ]".format(allow_special_char), "", text)

    elif label == "WEB":
        text = text.lower()
        allow_special_char = r":/.%#-"
        text = re.sub(r"[^A-Za-z0-9{} -]".format(allow_special_char), "", text)

    elif label in ("NAME", "DES"):
        text = text.lower()
        text = re.sub(r"[^a-z ]", "", text)
        text = text.title()

    elif label == "ORG":
        text = text.lower()
        text = re.sub(r"[^a-z0-9 ]", "", text)
        text = text.title()

    return text

In [122]:
parser(r"arjun\@gmail.com", "EMAIL")

'arjun\\@gmail.com'

Entities


In [116]:
info_array = dataframe_info[["token", "label"]].values
entities = dict(NAME=[], ORG=[], DES=[], PHONE=[], WEB=[], EMAIL=[])
previous = "O"

for token, label in info_array:
    bio_tag = label[:1]
    label_tag = label[2:]

    # step 1 parse the token
    text = parser(token, label_tag)

    if bio_tag in ("B", "I"):

        if previous != label_tag:
            entities[label_tag].append(text)

        else:
            if bio_tag == "B":
                entities[label_tag].append(text)

            else:
                if label_tag in ("NAME", "ORG", "DES"):
                    entities[label_tag][-1] = entities[label_tag][-1] + " " + text
                else:
                    entities[label_tag][-1] = entities[label_tag][-1] + text

    previous = label_tag

  allow_special_char = '@-_\.'


error: bad character range #-  at position 15