<a href="https://colab.research.google.com/github/YashviP/NLP-WITH-SPACY/blob/main/Rule_Based_matching.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Rule-Based Matching

```
Rule-based systems are a good choice if there’s a more or less finite number of
examples that you want to find in the data, or if there’s a very clear, 
structured pattern you can express with token rules or regular expressions.
```
Source: https://spacy.io/usage


# Token Matcher 

In [None]:
# Download the medium English spaCy pipeline that the cells below load with spacy.load("en_core_web_md").
!python3 -m spacy download en_core_web_md

In [None]:
import spacy
from spacy.matcher import Matcher

# Load the English pipeline and create a token-based Matcher that shares its vocabulary.
nlp = spacy.load("en_core_web_md")
matcher = Matcher(nlp.vocab)

# One dict per token: the word "feb" (any casing), a number, a punctuation mark, a number.
pattern = [
    {"LOWER": "feb"},
    {"IS_DIGIT": True},
    {"IS_PUNCT": True},
    {"IS_DIGIT": True},
]
matcher.add("DATE", [pattern])

doc = nlp("Your expected delivery date is feb 8 , 2021")

# Each match is (match_id, start_token, end_token); print the matched slice of the doc.
matches = matcher(doc)
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span)

feb 8 , 2021


# Regex Matcher

In [None]:
import spacy
import re

nlp = spacy.load("en_core_web_md")
doc = nlp("The United States of America (USA) are commonly known as the United States (U.S. or US) or America.")

# Matches "US", "U.S", "US." and "U.S." anywhere in the raw text.
expression = r"U[.]?S[.]?"
for match in re.finditer(expression, doc.text):
    start = match.start()
    end = match.end()
    # char_span maps character offsets back to tokens; it is None when the
    # offsets do not line up with token boundaries, and those hits are skipped.
    span = doc.char_span(start, end)
    if span is None:
        continue
    print("Found match:", span.text)

Found match: U.S.
Found match: US


# Phrase Matcher
Used when you have a list of terms (a terminology list) to match.

In [None]:
import spacy
from spacy.matcher import PhraseMatcher
nlp = spacy.load("en_core_web_md")
matcher = PhraseMatcher(nlp.vocab)

# Terms to look for in the text.
lang_list = ['Python', 'C++', 'Java']

# The PhraseMatcher compares token text (ORTH) by default, so the patterns only
# need to be tokenized. nlp.make_doc runs just the tokenizer instead of the full
# pipeline, which is faster and avoids spaCy's warning about patterns that were
# parsed/tagged unnecessarily.
patterns = [nlp.make_doc(lang) for lang in lang_list]
matcher.add("PROGRAMMING_LANGUAGE", patterns)

# Process some text and print every occurrence of a listed term.
doc = nlp("Python requires less typing, provides new libraries, fast prototyping, and several other new features. C++ as of today in its efficiency, speed, and memory make it widely popular among coders. Java is platform-independent")
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

Python
C++
Java


## Extract Information from Invoice

In [None]:
# System packages needed to build pdftotext (poppler C++ headers, compiler toolchain).
# apt-get with -y keeps the install non-interactive so "Run all" cannot stall on a prompt.
!apt-get install -y build-essential libpoppler-cpp-dev pkg-config python3-dev

In [None]:
# %pip installs into the environment of the running kernel (a bare !pip may target another Python).
%pip install pdftotext



In [None]:
import pdftotext

# Read the sample invoice; the resulting `pdf` object is iterated and indexed
# page by page in the cells below.
with open("/content/invoice-test.pdf", mode="rb") as pdf_file:
    pdf = pdftotext.PDF(pdf_file)


In [None]:
# Dump the extracted text of every page to inspect the layout the regexes must handle.
for page_text in pdf:
    print(page_text)

                                          ABC technologies    123 Any Street
                                               1234567890        Delhi, Delhi
                                                                     112239
                                                                         India
Billed To    Date of Issue Invoice Number                 Amount Due (USD)
Yashvi Patel
ABC
             09/06/2021
             Due Date
                           0000001
                                                     $1,900.00
Street A1
Delhi        09/07/2021
121150
India
Description                               Rate            Qty      Line Total
Test Item                            $1,900.00              1      $1,900.00
                                                Subtotal            1,900.00
                                                    Tax                   0.00
                                                   Total            1,900.00
                   

In [None]:
import re

In [None]:

# "Amount Due (USD)", optional padding spaces, then a dollar amount that ends in a digit.
amount_due = r"Amount[ ]Due[ ]\(USD\)[ ]*\$\d*[ .,\d]*\d"
re.compile(amount_due).findall("Amount Due (USD)                    $1,837.83")

['Amount Due (USD)                    $1,837.83']

In [None]:
# DD/MM/YYYY with three capture groups (day, month, year).
# Day is 01-31 and month is 01-12; "00" is rejected for both, which the
# previous 0[0-9] alternatives wrongly accepted (e.g. "00/00/2021").
date = r"(0[1-9]|[12][0-9]|3[01])/(0[1-9]|1[0-2])/([12][0-9]{3})"
re.findall(date, " 06/09/2021  07/09/2021")

[('06', '09', '2021'), ('07', '09', '2021')]

In [None]:
# Exactly seven digits. The lookarounds stop the pattern from matching a
# 7-digit slice of a longer number (e.g. the 10-digit phone number on the invoice).
invoice_number = r"(?<![0-9])[0-9]{7}(?![0-9])"
re.findall(invoice_number, "hello      1234567     6575812      ")

['1234567', '6575812']

In [None]:
import re
import spacy
from spacy.tokens import Span
from spacy.lang.en import English
from spacy.language import Language


# Entity label -> compiled pattern. Built once here rather than on every call.
#   amount_due:     "Amount Due (USD)" followed by a dollar amount
#   date:           DD/MM/YYYY with day 01-31 and month 01-12 ("00" is rejected)
#   invoice_number: exactly seven digits, not part of a longer digit run
_INVOICE_EXPRESSIONS = {
    "amount_due": re.compile(r"Amount[ ]Due[ ]\(USD\)[ ]*\$\d*[ .,\d]*\d"),
    "date": re.compile(r"(0[1-9]|[12][0-9]|3[01])/(0[1-9]|1[0-2])/([12][0-9]{3})"),
    "invoice_number": re.compile(r"(?<![0-9])[0-9]{7}(?![0-9])"),
}


@Language.component('regex_matcher')
def regex_matcher(doc):
    """Pipeline component that adds regex-based invoice entities to ``doc.ents``.

    Every pattern in ``_INVOICE_EXPRESSIONS`` is run over ``doc.text`` and each
    hit is converted to a labelled span with ``doc.char_span``.

    Args:
        doc: The spaCy ``Doc`` being processed by the pipeline.

    Returns:
        The same ``doc``, with the regex spans merged into ``doc.ents``.
    """
    spans = []
    for label, expression in _INVOICE_EXPRESSIONS.items():
        for match in expression.finditer(doc.text):
            start, end = match.span()
            # char_span yields None when the character offsets do not fall on
            # token boundaries; such hits cannot become entities and are skipped.
            entity = doc.char_span(start, end, label=label)
            if entity is not None:
                spans.append(entity)

    # Entities must not overlap. Filter the existing entities together with the
    # new spans so a regex hit overlapping an entity set by an earlier component
    # is resolved by filter_spans instead of raising when doc.ents is assigned.
    doc.ents = spacy.util.filter_spans(list(doc.ents) + spans)
    return doc
  

In [None]:
# Load the pipeline with its statistical NER component disabled and append the
# custom regex component registered above.
nlp = spacy.load("en_core_web_md", disable=["ner"])
nlp.add_pipe("regex_matcher")

# Only the first page of the PDF is processed here.
doc = nlp(pdf[0])

# Show every entity found on the page together with its label.
for entity in doc.ents:
    print(entity, entity.label_)

09/06/2021 date
0000001 invoice_number
09/07/2021 date
Amount Due (USD)              $1,900.00 amount_due


In [None]:
from spacy import displacy
# Highlight the entities of `doc` inline in the notebook output.
displacy.render(doc, jupyter=True, style="ent")