In [2]:
import numpy as np
import pandas as pd
import cv2
import pytesseract
import os
from glob import glob
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# Merge all cards to one df

In [15]:
img_paths = glob("./Selected/*.jpeg")

In [16]:
all_cards = pd.DataFrame(columns=["id", "text"])

for img_path in tqdm(img_paths, desc="Business Card"):
    # extract data and text
    _, filename = os.path.split(img_path)
    img = cv2.imread(img_path)
    data = pytesseract.image_to_data(img)
    datalist = list(map(lambda x: x.split("\t"), data.split("\n")))
    df = pd.DataFrame(datalist[1:], columns=datalist[0])
    df.dropna(inplace=True)
    df["conf"] = df["conf"].astype(float).astype(int)
    # threshold with conf 
    usefull_data = df.query("conf >= 30")
    # temp df to concat
    business_card = pd.DataFrame()
    business_card["text"] = usefull_data["text"]
    business_card["id"] = filename
    # concat to the whole df
    all_cards = pd.concat((all_cards, business_card))

Business Card: 100%|██████████████████████████| 293/293 [02:35<00:00,  1.88it/s]


In [19]:
all_cards.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10122 entries, 4 to 82
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10122 non-null  object
 1   text    10122 non-null  object
dtypes: object(2)
memory usage: 237.2+ KB


In [20]:
all_cards.to_csv("business_cards.csv", index=False)

In [27]:
# get labeling

all_cards_labels = pd.read_csv("businessCard.csv", encoding="cp1251")
all_cards_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10445 entries, 0 to 10444
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10445 non-null  object
 1   text    9363 non-null   object
 2   tag     10445 non-null  object
dtypes: object(3)
memory usage: 244.9+ KB


# Data preprocessing

In [28]:
import string
import re

In [29]:
with open('businessCard.txt', \
          mode='r', \
          encoding='utf8', \
          errors='ignore') as f:
    text = f.read()

print(text)

id	text	tag
000.jpeg	 	O
000.jpeg	.	O
000.jpeg	040-4852	B-PHONE
000.jpeg	"8881,"	I-PHONE
000.jpeg	90309	B-PHONE
000.jpeg	52549	I-PHONE
000.jpeg	Fi	O
000.jpeg	/laurelsoverseaseducation	O
000.jpeg	@:	O
000.jpeg	LAURELS	B-ORG
000.jpeg	OVERSEAS	I-ORG
000.jpeg	EDUCATIONAL	I-ORG
000.jpeg	CONSULTANCY	I-ORG
000.jpeg	PVT.	I-ORG
000.jpeg	LTD.	I-ORG
000.jpeg	Sea	O
000.jpeg	|	O
000.jpeg	U.K	O
000.jpeg	AUSTRALIA	O
000.jpeg	CANADA	O
000.jpeg	IRELAND	O
000.jpeg	 	O
000.jpeg	 	O
000.jpeg	 	O
000.jpeg	 	O
000.jpeg	 	O
000.jpeg	 	O
000.jpeg	www.laurelseducation.com	B-WEB
000.jpeg	)%info@laurelseducation.com	B-EMAIL
000.jpeg	 	O
001.jpeg	john	B-NAME
001.jpeg	smith	I-NAME
001.jpeg	marketing	B-DES
001.jpeg	manager	I-DES
001.jpeg	web:	O
001.jpeg	www.psdgraphics.com	B-WEB
001.jpeg	phone:	O
001.jpeg	123-456-7890	B-PHONE
001.jpeg	mail:	O
001.jpeg	email@psdgraphics.com	B-EMAIL
002.jpeg	    	O
002.jpeg	   	O
002.jpeg	Sau	O
002.jpeg	0	O
002.jpeg	98489	B-PHONE
002.jpeg	24441	I-PHONE
002.jpeg	dy	O
002.jpeg	"08672,"

In [42]:
data = list(map(lambda x: x.split('\t'), text.split('\n')))
df = pd.DataFrame(data[1:], columns=data[0])
df.head()

Unnamed: 0,id,text,tag
0,000.jpeg,,O
1,000.jpeg,.,O
2,000.jpeg,040-4852,B-PHONE
3,000.jpeg,"""8881,""",I-PHONE
4,000.jpeg,90309,B-PHONE


## Preprocess
* remove whitespaces
* remove special characters

In [43]:
whitespace = string.whitespace
punctuation = '!#$%&\'()*+/:;<=>?[\\]^`{|}~'
table_whitespace = str.maketrans('', '', whitespace)
table_puctuation = str.maketrans('', '', punctuation)

def cleanText(text):
    text = str(text)
    text = text.lower()
    removeWhitespace = text.translate(table_whitespace)
    removePunctuation = removeWhitespace.translate(table_puctuation)

    return str(removePunctuation)

In [44]:
df['text'] = df['text'].apply(cleanText)

In [45]:
dataClean = df.query("text != '' ")
dataClean.dropna(inplace=True)

In [48]:
dataClean.head(10)

Unnamed: 0,id,text,tag
1,000.jpeg,.,O
2,000.jpeg,040-4852,B-PHONE
3,000.jpeg,"""8881,""",I-PHONE
4,000.jpeg,90309,B-PHONE
5,000.jpeg,52549,I-PHONE
6,000.jpeg,fi,O
7,000.jpeg,laurelsoverseaseducation,O
8,000.jpeg,@,O
9,000.jpeg,laurels,B-ORG
10,000.jpeg,overseas,I-ORG


## Convert to spacy required format

In [52]:
group = dataClean.groupby(by='id')
group.groups.keys()

dict_keys(['000.jpeg', '001.jpeg', '002.jpeg', '003.jpeg', '004.jpeg', '007.jpeg', '008.jpeg', '009.jpeg', '010.jpeg', '011.jpeg', '012.jpeg', '013.jpeg', '014.jpeg', '015.jpeg', '016.jpeg', '017.jpeg', '018.jpeg', '020.jpeg', '021.jpeg', '022.jpeg', '023.jpeg', '024.jpeg', '025.jpeg', '027.jpeg', '028.jpeg', '030.jpeg', '031.jpeg', '032.jpeg', '033.jpeg', '034.jpeg', '035.jpeg', '036.jpeg', '037.jpeg', '038.jpeg', '039.jpeg', '040.jpeg', '041.jpeg', '042.jpeg', '043.jpeg', '044.jpeg', '045.jpeg', '047.jpeg', '048.jpeg', '049.jpeg', '050.jpeg', '051.jpeg', '052.jpeg', '053.jpeg', '054.jpeg', '055.jpeg', '056.jpeg', '057.jpeg', '058.jpeg', '059.jpeg', '060.jpeg', '061.jpeg', '062.jpeg', '063.jpeg', '064.jpeg', '065.jpeg', '066.jpeg', '067.jpeg', '068.jpeg', '069.jpeg', '070.jpeg', '071.jpeg', '072.jpeg', '073.jpeg', '074.jpeg', '075.jpeg', '076.jpeg', '078.jpeg', '079.jpeg', '080.jpeg', '081.jpeg', '082.jpeg', '083.jpeg', '084.jpeg', '085.jpeg', '086.jpeg', '087.jpeg', '088.jpeg', '089.

In [64]:
cards = group.groups.keys()

In [65]:
allCardsData = []

for card in cards:
    cardData = []
    group_array = group.get_group(card)[['text', 'tag']].values
    content = ''
    annotations = {'entities': []}
    start = 0
    end = 0
    
    for text, label in group_array:
        #print(text, label)
        text = str(text)
        str_len = len(text) + 1
    
        start = end
        end = start + str_len
    
        if label != 'O':
            annot = (start, end - 1, label)
            annotations['entities'].append(annot)
    
        content += text + ' '

    cardData = (content, annotations)
    allCardsData.append(cardData)

In [66]:
allCardsData

[('. 040-4852 "8881," 90309 52549 fi laurelsoverseaseducation @ laurels overseas educational consultancy pvt. ltd. sea u.k australia canada ireland www.laurelseducation.com info@laurelseducation.com ',
  {'entities': [(2, 10, 'B-PHONE'),
    (11, 18, 'I-PHONE'),
    (19, 24, 'B-PHONE'),
    (25, 30, 'I-PHONE'),
    (61, 68, 'B-ORG'),
    (69, 77, 'I-ORG'),
    (78, 89, 'I-ORG'),
    (90, 101, 'I-ORG'),
    (102, 106, 'I-ORG'),
    (107, 111, 'I-ORG'),
    (145, 169, 'B-WEB'),
    (170, 195, 'B-EMAIL')]}),
 ('john smith marketing manager web www.psdgraphics.com phone 123-456-7890 mail email@psdgraphics.com ',
  {'entities': [(0, 4, 'B-NAME'),
    (5, 10, 'I-NAME'),
    (11, 20, 'B-DES'),
    (21, 28, 'I-DES'),
    (33, 52, 'B-WEB'),
    (59, 71, 'B-PHONE'),
    (77, 98, 'B-EMAIL')]}),
 ('sau 0 98489 24441 dy "08672," 224441 enkateswapa wie ',
  {'entities': [(6, 11, 'B-PHONE'), (12, 17, 'I-PHONE'), (37, 48, 'B-ORG')]}),
 ('prasad @ "9,96,31,73,53,59,49,04,00,000" i flex design album des

# Split into train / test sets

In [68]:
import random
import pickle

random.shuffle(allCardsData)
print(len(allCardsData))

267


In [69]:
trainSet = allCardsData[: 240]
testSet = allCardsData[240: ]

In [70]:
#!mkdir data

pickle.dump(trainSet, open('./data/TrainData.pickle', mode='wb'))
pickle.dump(testSet, open('./data/TestData.pickle', mode='wb'))

In [74]:
# train mode of spacy
#python -m spacy train config.cfg --output ./output \
#--paths.train ./data/train.spacy --paths.dev ./data/test.spacy
