In [1]:
import os
from shutil import unpack_archive

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf

In [2]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
unpack_archive("/content/drive/MyDrive/Colab Notebooks/RZ/Demo Day/OCR.zip")

In [4]:
for dirname, _, filenames in os.walk('./ImageAndXML_Data'):
    filepath = dirname
    list_filename = filenames
    # for filename in filenames:
    #    print(os.path.join(dirname, filename))

# Accessing the directory with all the files and all the filenames

In [5]:
print(filepath)
print(list_filename)

./ImageAndXML_Data
['2041532206_gt.xml', 'ti16400164.tif', '99380808_0809_gt.xml', '11002591_ocr.xml', '2063610122_ocr.xml', '2063610122.tif', '00555341.tif', '89703081_gt.xml', '2080705096_gt.xml', '80233716_gt.xml', '2042525639_ocr.xml', '86463030_ocr.xml', '2072957946_ocr.xml', '82898529_gt.xml', '0060203682_gt.xml', '87021931_ocr.xml', '2041597671_2041597672_ocr.xml', '526509181+-9181_gt.xml', '86018092.tif', '2041222828_ocr.xml', '0060076201.tif', '2051801680.tif', '2043342113_ocr.xml', '2041532206.tif', '2029377295.tif', '2083197818_7819.tif', '91543951_3952_ocr.xml', '2029377295_ocr.xml', '0000223278_ocr.xml', '2063322469_gt.xml', '0060027402_gt.xml', 'ti16310545.tif', '87147464.tif', '2063650388_ocr.xml', '2040687664_gt.xml', '2065216982_gt.xml', '2044696237_gt.xml', '83565599.tif', '87803300_gt.xml', '2040345058.tif', '2063235029.tif', '0001136521.tif', '526450353+-0353.tif', '0060094811_gt.xml', '500234635+-4635_ocr.xml', '2063576893_ocr.xml', '2070584510_gt.xml', '2029370464

# XML Parsing

XML parsing in python is done either by using **lxml parser along with beautiful soup or using elementtree library**. Choose the one which suits your needs better.

Our aim is to read the text data from XML file and put the information of all the files in a text file. First we read a file and try to get the text from a single file then we loop over all the files to get the text from all the files.

In [6]:
#For making list of all the *_OCR.xml files in the data folder use glob or fnmatch+os.listdir()
import fnmatch

file_list = []
for file in list_filename:
    if fnmatch.fnmatch(file,'*_ocr.xml'):
        file_list.append(file)    

In [7]:
#Let's try to open one file and see how well does it help us:
with open(os.path.join(dirname,file_list[0]),'r') as f:
    data = f.read()
f.close()    

In [8]:
print(data)

<?xml version="1.0" encoding="utf-8"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15">
  <Metadata>
    <Creator>ABBYY FineReader Engine 11 + alto2page.xslt 2018.11.09</Creator>
    <Created>2019-01-23T00:00:00</Created>
    <LastChange>2019-01-23T00:00:00</LastChange>
  </Metadata>
  <Page imageFilename="11002591.tif" imageHeight="1000" imageWidth="771">
    <TextRegion id="Page1_TopMargin">
      <Property key="Margin" value="Top"/>
      <Coords points="0,0 771,0 771,28 0,28"/>
    </TextRegion>
    <TextRegion id="Page1_LeftMargin">
      <Property key="Margin" value="Left"/>
      <Coords points="0,28 51,28 51,902 0,902"/>
    </TextRegion>
    <TextRegion id="Page1_RightMargin">
      <Property key="Margin" value="Right"/>
      <Coords points="751,28 771,28 771,902 751,902"/>
    </TextRegion>
    <TextRegion id="Page1_BottomMargin">
      <Property key="Margin" value="Bottom"/>
      <Coords points="0,902 771,902 771,1000 0,1000"/>
    </TextRegi

# Observations:

```
# The structure of xml(for our interest) is:
# <TextRegion>
#     <TextLine>
#         <Word>
#             <Unicode> Text </Unicode>
#         </Word>
#     </TextLine>
# </TextRegion>
```

**Information in a particular text region (< TextRegion >) is a block of information which is of the same type like address or description. But this block of informaiton may be divided into different lines (< TextLine >).**

While reading we need to club information according to text line and text region to make it more reasonable so that sequential property of the text data could be preserved. Opposed to this a normal final_all text in xml would give non sequential data where sequential property of the data will be lost. Information from XML can be extracted row wise, block wise or all the text altogether. Row wise text is smaller in size and contains less sequential information so it will be better to use if for regex comparison. While whole text together or block wise information is expected to be in more sequential manner and thus more suitable for unsupervised learning methods (not supervise because of no labeled data).

NOTE: From xml we are currently not taking the confidence into account, for a better model confidence threshold should be decided/optimized and used.

Parsing the XML file using element tree:

In [9]:
from lxml import etree, objectify

parser = etree.XMLParser(remove_blank_text=True)
tree = etree.parse(os.path.join(dirname,file_list[0]))
root = tree.getroot()

#This file contains xml name space which come attached with each tag, 
#xmlns makes it difficult for a reader to focus on desired tag, so we remove the namespaces
for elem in root.getiterator():
    if not hasattr(elem.tag, 'find'): continue  
    i = elem.tag.find('}')
    if i >= 0:
        elem.tag = elem.tag[i+1:]
objectify.deannotate(root, cleanup_namespaces=True)

In [10]:
#all the tag in xml are added to tag_list 
tag_list = []
for element in root.iter():
    tag_list = element.tag

In [11]:
# check the output if its in the same order as we need it
for element in root.iter('Word'):
    print('block and line info = ' + str(element.attrib))
    for child in element:
        #print(child.tag)
        if child.tag == 'TextEquiv':
            print('for text confidence score =' + str(child.attrib))
            for grandchild in child:
                if grandchild.tag == 'Unicode':
                    print('text = '+str(grandchild.text))

block and line info = {'id': 'Page1_Block1_l1_w1'}
for text confidence score ={'conf': '0.473333329'}
text = 8**
block and line info = {'id': 'Page1_Block1_l1_w2'}
for text confidence score ={'conf': '0.6075000167'}
text = ICO^
block and line info = {'id': 'Page1_Block2_l1_w1'}
for text confidence score ={'conf': '0.150000006'}
text = f
block and line info = {'id': 'Page1_Block2_l1_w2'}
for text confidence score ={'conf': '0.2599999905'}
text = *
block and line info = {'id': 'Page1_Block4_l1_w1'}
for text confidence score ={'conf': '0.6342856884'}
text = INVOICE
block and line info = {'id': 'Page1_Block4_l1_w2'}
for text confidence score ={'conf': '0.5433333516'}
text = NO.
block and line info = {'id': 'Page1_Block4_l2_w1'}
for text confidence score ={'conf': '0.4099999964'}
text = B-07025
block and line info = {'id': 'Page1_Block5_l1_w1'}
for text confidence score ={'conf': '0.337500006'}
text = SOLO
block and line info = {'id': 'Page1_Block5_l1_w2'}
for text confidence score ={'conf'

In [12]:
# getting the information in the format in which we need it:
import re
prev_block_Page = ''
prev_block_Block = ''
prev_block_Line = ''
prev_block_Word = ''
sentence = ''
sentence2 = ''
sentence3 = ''
block = []
line = []
for element in root.iter('Unicode'):
    same_page = same_block = same_line = next_word = False
    parent_node = next(element.iterancestors('Word'))
    block_list = parent_node.attrib['id'].split('_')
    if(prev_block_Page == block_list[0] or prev_block_Page == ''):
        same_page = True
    if(prev_block_Block == block_list[1] or prev_block_Block == ''):
        same_block = True        
    if(prev_block_Line == block_list[2] or prev_block_Line == ''):
        same_line = True
    if(prev_block_Word == int(block_list[3][1:])-1 or prev_block_Word == ''):
        next_word = True
                    
    #only same line present in one sentnece:
    #Here we check if the sentence contains the keywords which we are looking for like 'Date'
    if same_line and same_block:
        sentence3 = sentence3 + re.sub("[^0-9a-zA-Z:,]+", ' ',element.text) + ' ' 
    else:
        line.append(sentence3)
        sentence3 = ''
        sentence3 = sentence3 + re.sub("[^0-9a-zA-Z:,]+", ' ',element.text)+ ' '
    
    #same block in one line:
    if same_block:
        sentence = sentence + re.sub("[^0-9a-zA-Z:,]+", ' ',element.text) + ' '
    else:
        block.append(sentence)
        sentence = ''
        sentence = sentence + re.sub("[^0-9a-zA-Z:,]+", ' ',element.text)+ ' '  
      
    #all text in same line:
    sentence2 = sentence2 + re.sub("[^0-9a-zA-Z:,]+", ' ',element.text) + ' '
        
    prev_block_Page = block_list[0]
    prev_block_Block = block_list[1]
    prev_block_Line = block_list[2]
    prev_block_Word = int(block_list[3][1:])
print(line)
print(block)    
print(sentence2)

['8  ICO  ', 'f   ', 'INVOICE NO  ', 'B 07025 ', 'SOLO TO ', 'The Council for Tobacco Research ', 'DATE ', '7 13 67 ', 'crcoiV do6c ', '033B ', 'XEBXiss : rr ', '633 Third Avenue ', 'Net 30 Days ', 'New York, New York 10017 ', 'Attention: Dr  R  C  Hockett ', 'DESTINATION ', 'CUSTOMER S ORDER NO  B L NO  ', 'BNPI  19 5 Subst i tut e ', '  O L WOT  ', 'F O B  ', 'CAR INTLS  ft NO  ', 'SHIPPIP ', 'IO POINT ', 'PNL ORDER NO  ', 'DATE SHIPPED ', 'COMMODITY ', 'Project Y 1 0153 ', 'Research on Methods of Measuring the ', 'Deposition of Inhaled Cigarette Smoke in Dogs ', 'CODE ', 'For Services Rendered ', 'June, 1967 ', 'Approved   Pt r 0 ', '  9  ', 'MAKE CHECK PAYABLE TO: ', 'CASHIER, PACIFIC NORTHWEST LABORATORIES ', 'REMIT TO  ', 'CASHIER, PACIFIC NORTHWEST LABORATORIES ', 'P  O  BOX 999, RICHLAND, WASH  99352 ', 'QUANTITY ', 'rt ', 'PRICK ', 'AMOUNT ', ' 2,755 55 ', 'PLEASE RETURN ONE COPY OF THIS INVOICE WITH REMITTANCE ', 'Q25420  ']
['8  ICO  ', 'f   ', 'INVOICE NO  B 07025 ', 'SOLO 

In [13]:
###########################################################################################
###########                                                                     ###########
########### This is the summary of complete process, looping over all the files ###########                      
###########                                                                     ###########
###########################################################################################
#Do the whole process on the list of all the files:
#Reading the file saving the contents in data and proceeding 

filename = 'beautiful_data.txt'
write_txt_to_file = open(filename,'a')
write_txt_to_file.write(sentence2)
write_txt_to_file.close()

In [14]:
import csv
field = ['TEXT']
row = sentence
filename = 'beautiful_data.csv'
with open(filename,'a',newline='') as csvfile:
    csvwriter = csv.writer(csvfile)#creates an object for writing in csv files
    csvwriter.writerow(sentence)

In [15]:
###########################################################################################
###########                                                                     ###########
########### This is the summary of complete process, looping over all the files ###########                      
###########                                                                     ###########
###########################################################################################
#Do the whole process on the list of all the files:
#Reading the file saving the contents in data and proceeding 

txt_filename = 'beautiful_data_summary.txt'
for xml_file in file_list:
    parser = None
    root = None
    tree = None
    new_sentence = ''
    parser = etree.XMLParser(remove_blank_text=True)
    xml_file_path = os.path.join(dirname,xml_file)
    tree = etree.parse(xml_file_path)
    root = tree.getroot()
    #remove the namespaces
    for elem in root.getiterator():
        if not hasattr(elem.tag, 'find'): continue  
        i = elem.tag.find('}')
        if i >= 0:
            elem.tag = elem.tag[i+1:]
    objectify.deannotate(root, cleanup_namespaces=True)
    
    for element in root.iter('Unicode'):
        new_sentence = new_sentence + re.sub("[^0-9a-zA-Z:,]+", ' ',element.text) + ' '
    #print('new_sentence = '+new_sentence)    
    #writing the new_sentence in the file 
    wrtie_txt_to_file = None
    write_txt_to_file = open(txt_filename,'a')
    write_txt_to_file.write(new_sentence)
    write_txt_to_file.write('\n')
    write_txt_to_file.close()
    

In [16]:
import pandas as pd
pd.set_option('display.max_colwidth',3000)
df = pd.read_csv('./beautiful_data_summary.txt',delimiter='/n',header=None)
df.head(7)

  return func(*args, **kwargs)


Unnamed: 0,0
0,"8 ICO f INVOICE NO B 07025 SOLO TO The Council for Tobacco Research DATE 7 13 67 crcoiV do6c 033B XEBXiss : rr 633 Third Avenue Net 30 Days New York, New York 10017 Attention: Dr R C Hockett DESTINATION CUSTOMER S ORDER NO B L NO BNPI 19 5 Subst i tut e O L WOT F O B CAR INTLS ft NO SHIPPIP IO POINT PNL ORDER NO DATE SHIPPED COMMODITY Project Y 1 0153 Research on Methods of Measuring the Deposition of Inhaled Cigarette Smoke in Dogs CODE For Services Rendered June, 1967 Approved Pt r 0 9 MAKE CHECK PAYABLE TO: CASHIER, PACIFIC NORTHWEST LABORATORIES REMIT TO CASHIER, PACIFIC NORTHWEST LABORATORIES P O BOX 999, RICHLAND, WASH 99352 QUANTITY rt PRICK AMOUNT 2,755 55 PLEASE RETURN ONE COPY OF THIS INVOICE WITH REMITTANCE Q25420 CTR SP FILES 002572"
1,"Philip Morris USA BUSINESS fVOUCW herT PHILIP MORRIS USA VOUCHER DATE: Oct 14 1996 PLEASE READ INSTRUCTIONS ON PAGE 2 PAYEE: CEHHT Inc SPECIAL INSTRUCTIONS: PLEASE SEND THIS 6001 Mohtfose Boid Suite 4o 2a XI V CHECK TO MARGARET OPOCENSKY R D A 2 ROCk ile, lD20852 FOR TRANSMITTAL COST CENTER EXPLANATION OF PAYMENT TO BE CHARGED S AMOUNT Env Tob Smoke Literature Database for July 1 31,1996 2R1 5,000 00 Env Tob Smoke Literature Database for Auq 1 31,1996 2R1 SI 5 000 00 Total S20 000 00 1 PREPARED BY: Margaret Opocensky CHECK DATE DUE: Oct 18, 19096 LOCATION: R D A 2 Extension: 3882 APPROVED: M R A Carchman FOR ACCOUNTING USE ONLY VENDOR: INVOICE NO : gl account amount Q 111 V vlvL 1 ivii INVOICE AMOUNT: INVOICE DATE: ACTION: i 1 2 1 3 1 4 1 P O NBR Q : 5 1 DUE DATE Q : CHECK DESCRIPTION 1 : 6 1 7 1 a 1 9 1 10 1 ACCTG AUDIT: ii 1 12 1 13 1 ACCTG APPROVAL: 14 1 DATA ENTRY: 15 1 FORWARD THIS COPY TO: CASH DISBURSEMENTS FINANCE CENTER DOC CODE: V0120 2063610122"
2,"the commercial 211 EAST 4 3RD STREET NEW YORK N Y TO: Mr Neil Holbert Philip Morris, USA 100 Park Avenue New York, New York 10017 Date Invoice Our Job Ycur P O : 10017 945 9040 2 28 78 2758 3050 INVOICE TITLE: Cigarette Tracking 1978 FEE: 111,300 00 5 BALANCE DUE: First Partial Payment 37,100 00 P tate no: all cutmcnrjirss sna ISM cotit art stated fct :ix monr i a I tit zcrroie: cn cf tr t jfwCV era tr en discord li zr leze i written rscutit tr a: tr sy zt sene to you : received MARKETING ECONOMIC AND MOTIVATION RESEARCH 2042525639"
3,"86 MAPLE AVE NEW CITY, N Y, 10956 5036 914 634 1331 800 767 7967 AX:914 634 9618 Mr Andrew Pasheluk Lorillard Tobacco Co 714 Green Valley Rd Box 10529 Greensboro NC 27408 7018 received jul i MP1D INVOICE : 99065516 INV DATE: 07 01 99 ACCOUNT 5516 BILLING PERIOD: 06 01 99 to 06 30 99 TERMS: PCS DESCRIPTION PIECE RATE 1 55 OZ RATE 0 00 TOTAL 22 1 OZ MAIL PIECES 34 10 34 10 80 2 OZ MAIL PIECES 124 00 0 00 124 00 102 TOTAL 158 10 0 00 158 10 lUAL SERVICE CHARGE 200 00 TOTALZOSSUBBS CHARGES SUMMARY OF ACCOUNT PAST DUE PAYMENTS CREDITS CURRENT BILLING FINANCE CHARGE 6 20 6 20 358 10 0 00 358 10 PAYMENT DUE UPON REC E INCLUDE YOUR ACCOUNT 5516 FOR PROPER CREDIT CO Os Os CSJ O J o PAST DUE BALANCES ARE SUBJECT TO A 1 5 MONTHLY FINANCE CHARGE"
4,"MANAGEMENT SCIENCE ASSOCIATES, INC 6565 Penn Avenue at Fifth Pittsburgh, PA 15206 4490 4121 362 200C nr TO: Mr Odvc TenBarge PHILIP MORRIS, U S A 120 Park Avenue , flew York, NY 10017 INVOICE SEND REMITTANCE TO: Management Science Associates, In P O Box 400155 W PiltsOuigh Pennsylvania 15268 0155 PLEASE REFER tg OUR I WCHGEJiMI OR RETURN IHVOlCEXOPY WMEMR AMOUNTS NOT RECEIVED WITHIN 30 DAYS OF INVOICE DATE ARE SUBJECT TO 1 5 MONTHLY SERVICE CHARGE 18 PER ANNUM INVOICE Nuue fT 9210153 iMvoict o rc October 26, 1992 l NT AUTHOR 2ATION NUMBER FEDERAL ID NO 25 1126415 : ItebmT V NET 10 DA DESCRIPTION AMOUNT ALES , INFORMATION SVCS DATANET USAGE 0TH7O14 September 1992 Data Month TOTAL AMOUNT DUE 8,579 00 30 13433 1 :i Rose F 2,359 00 30 13443 s 3,845 00 20 18444 s 423 00 30 13445 613 00 30 13446 s 1,134 00 30 18447 s 155 00 Berarflelli 2072957946"
5,"Invoice No Hr Andy Pasheluk Brand Manager Lorillard Company 666 Fifth Avenue Nev York, New York 10103 Stephan l ton An A Design Consultant LION ARTS, INC 1194 HasboroMtoVtlaNo tl Hfeboro B ach Fla 33062 005 421 4064 PO Box 32 Southampton N Y 11966 516 728 2430 Dale January 6, 1986 Our Job No Your P O No Terms Not thirty days fagjCMe Design Program for the Nev True King Size Filter, Ring Size Menthol, 100 a Filter n d 100 Monthol Packs and Cartona Out of pocket expenses C 76888 Prl,it C 1 76538 76268 76048 76786 76774 Photo Lettering, Inc i Noi 81623 as per enclosed invoices 169 41 64 95 185 11 37 35 37 80 38,42 Cowp Shop, Inc 1 K0114496 14428 14526 6,593 00 3,910 00 Zx2A4t50 20 handling chargei Total 533 12 203 51 1 8 217 50 18,954 13 3 790 82 22,744 95 TC6T2068"
6,"0S O3 1S94 09:39 a z as 2127532304 R PROPERTIES INT L gjk LEO BURNETT U 8A API ovvxm o lq hjuctt commny wt P ADVERTISING 26 WEST WACKEJl CTIVR CHICAGO LLI 0 8 80641 3 J 220 686 SPACE ORDER NO 10PK GAMED 12 MAGAZINE Data 06 11 93 PAGE 1 Advartlaar PHILIP MORRIS INC Contract Year 09 01 93 THRU C8 31 S4 Discount Laval Last tflsney commlsalon 15 CO on you Cash dlaoount 2 00 on nat 10 DAYS To tha publlahar of: GAME DAY MAGAZINE MR GERALD GERARDIELLO NATIONAL SALES MANAGER NATIONAL FOOTBALL LEAGUE 410 PARK AVENUE NEW YORK NY 10022 DATE SPACE CCLOR BLEED MAIL INVOICE TO: Accounting Oapartmont Saotlon a PRODUCT NOTES VENDOR COST SEP 93 10 SACK COVER 4 C BLEED 10 MARLB0R0 01 161,840 00 POS ED: 1993 SEASON NOTE 01 GAME DAY PACKAGE INCLUDESl OPFICIAL NFL SUPERSOWL XXVI2I GAME DAY FRCGRAK BACK COVER HAWAII PRO BOWL GAMS DAY PROGRAM SACK COVER TEAM NFL BACK COVER PLEASE SIGN AND RETURN THE ATTACHED DUPLICATE THU l ACt 0 Dt II tUIJCCT TO III CCHOTrTlONt HCMON AHO THE AAAA eoaomoia co aicht mi THU I ACt 0 3tA It IUB ICT TO THI ALIlltMIVI MMUIf T TC l THAT AATI8 mCalOiftO OlfCO HTi TC, Am W C0 0 MtTV WfTM AWV QOVtM yinTALiV M oit3 Mi Hum on moulationi m imict s g tni tun 1 I r aCC onotn, AXr DI ft UAZt ct A r M CCI CHAMID wh h I U fll zt f I A 0 Ali MTf CTC CDZZZJ UVJl PUF ur rr t: r s r iur cvf v r : W A v ALL V iV CL V if uOi LL I CUYWfMC UPPu i OiH 0 r aw AU tenu ezv mv i LEO BURNETT U S Ap ORIGINAL 2041597671"


# Setting up SPACY Pipeline

Now we have text information from the xml files. Next step is to extract the entities.
Using SPACY for extracting relevant information

In [17]:
import spacy
import spacy.cli
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [18]:
# Detecting date and filling it in date dictionary, directly puttin in a 
# series won't work because in case of multiple entry as list it will take only
# the last entry:
date_dict = {}
#invoice_no = {}
customer_name_dict = {}
#total_amt = {}
for i in range(0, df.shape[0]):
  date_list = []
  name_list = []
  text = df.iloc[i,:]
  doc = nlp(str(text))
  for ent in doc.ents:
      if ent.label_ == 'DATE':
          date_list.append(ent.text)
          #date_dict[i] = ent.text
          #df['Date'] = ent.text
      elif ent.label_ == 'PERSON':
          name_list.append(ent.text)

  date_dict[i] = date_list        
  customer_name_dict[i] = name_list  

# Unfiltered Results:

Code above give us the results in the unfiltered form (for ex: date may contain date and some other numbers tagged as date). Now we need to apply several other techniques to filter the results, one is discussed below.

In [19]:
import pprint
#pprint.pprint(customer_name_dict)
pprint.pprint(date_dict)

{0: ['07025', '30 Days', 'June, 1967'],
 1: ['THIS 6001', 'July 1', 'Oct 18, 19096'],
 2: ['2758 3050', '1978'],
 3: ['10956 5036',
     '1331 800 767',
     '9618',
     '10529',
     '27408 7018',
     '07 01 99',
     '06 01 99 to 06 30 99'],
 4: ['15206 4490',
     '15268 0155',
     '30 DAYS OF INVOICE DATE ARE SUBJECT',
     'October 26, 1992',
     'September 1992',
     '30 13433 1',
     '423 00 30',
     '13445',
     '13446',
     '1,134 00'],
 5: ['10103', '2430', 'January 6, 1986', 'thirty days', '81623', '14428 14526'],
 6: ['O3 1S94', '2127532304', '09 01 93', '1993', '2041597671'],
 7: ['20044', '19750 3', 'May 19, 1995', 'January 1995'],
 8: ['24 9', 'FISCAL YEAR'],
 9: ['10019', '30 DAYS'],
 10: ['1201',
      '10178',
      '352 0102',
      'January 9, 1991',
      'January February',
      '2029377295'],
 11: ['40202', '1981'],
 12: ['November 4, 1996', 'November', '2216'],
 13: ['71 0167'],
 14: ['October 10, 1991'],
 15: ['5 1990'],
 16: ['18674', '23831'],
 17: 

# POC for filtering results:

Following code shows how by applying pipeline in series a result can be filtered out. Several random values assigned as dates in the previous result are assigned as 'CARDINAL' value here. It shows how a result can be filtered using the pipeline in series. Here we have used same pipeline for better result a different pipeline (different) model should be used.

In [20]:
stopper = 0
for key in date_dict:
    stopper += 1
    print(key)
    print(date_dict[key])
    for iter in range(0,len(date_dict[key])):
        token = date_dict[key][iter]
        doc = nlp(str(token))
        for ent in doc.ents:
          print(ent.text, ent.label_)
    if stopper > 2:
      break
    print('******************************************************************************************')

0
['07025', '30 Days', 'June, 1967']
07025 DATE
30 Days DATE
June, 1967 DATE
******************************************************************************************
1
['THIS 6001', 'July 1', 'Oct 18, 19096']
6001 DATE
July 1 DATE
19096 DATE
******************************************************************************************
2
['2758 3050', '1978']
2758 3050 DATE
1978 DATE


# Matching the custom patterns

Loading the spacy Matcher and making a matcher object from it. The pattern to be matched can be added to 'matcher' and the 'matcher' object can be applied to the spacy doc (containing thet text data) inorder to match with the customized patterns.

In [21]:
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab, validate=True)
df[0].astype('string')
print(type(df.iloc[5][0]))

<class 'str'>


# Call_back function

In [22]:
date_pattern3 = [{'LOWER':{"REGEX":'^jan|^feb|^mar|^apr|^may|^jun|^jul|^aug|^sep|^oct|^nov|^dec'}},{'LOWER':{'REGEX':'\d{1,2}'}},{'LOWER':{'REGEX':'\d{0-4}'}}]
date_pattern4 = [{'LOWER':{"IN":['jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec']}}]


matcher.add('Custom_Date', [date_pattern4])
doc = nlp(df.iloc[i][0])
matches = matcher(doc)
matches

for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    #print(match_id, string_id, start, end, span.text)
    print(start, end, span.text)

In [23]:
date_pattern1 = [{'LOWER':{"REGEX":'(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'}},{}]
                           

def callback_method(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    entity = doc[start:end]
    print(entity.text)
    
matcher = Matcher(nlp.vocab)
matcher.add('Date_Cust2', [date_pattern1], on_match = callback_method)
doc = nlp(df.iloc[5][0])
matcher(doc)    

Hfeboro B
January 6


[(3056582165036000932, 36, 38), (3056582165036000932, 62, 64)]

**Hope this helps you to start with the data set and gives you an idea about what to do and how to do. Do share your amazing ideas and methods for entity extraction from such files.**