In [1]:
import cv2
import numpy as np
import pytesseract
import re
import os
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'


In [2]:
# OpenCv function to get grayscale image
def get_grayscale(image):                    
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  #This Function in OpenCV2 Provides to get GrayScale 

# noise removal in Given Image
def remove_noise(image):
    return cv2.medianBlur(image,5) #Removing Noice in the Picture is Way more Important 
 
#thresholding
def thresholding(image):
    return cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

#dilation
def dilate(image):
    kernel = np.ones((5,5),np.uint8)
    return cv2.dilate(image, kernel, iterations = 1)
    
#erosion
def erode(image):
    kernel = np.ones((5,5),np.uint8)
    return cv2.erode(image, kernel, iterations = 1)

#opening - erosion followed by dilation
def opening(image):
    kernel = np.ones((5,5),np.uint8)
    return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)

#canny edge detection
def canny(image):
    return cv2.Canny(image, 100, 200)

#skew correction
def deskew(image):
    coords = np.column_stack(np.where(image > 0))
    angle = cv2.minAreaRect(coords)[-1]
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotation = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotation

#template matching
#def match_template(image, template):
  #return cv2.matchTemplate(image, template, cv2.TM_CCOEFF_NORMED) 

In [3]:

def extract_text_from_image(image_path,filename):
    image = cv2.imread(image_path)
    # convert to grayscale
    # gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # # gray = cv2.resize(gray, None, fx=.5, fy=.5)
    
    # # apply Gaussian blurring to smooth out the image
    # # thresh = cv2.GaussianBlur(gray, (7,7), 0)
    # thresh = cv2.medianBlur(gray,9)

    # # apply Otsu's thresholding to detect text regions
    # # thresh = cv2.adaptiveThreshold(blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 5, 11)
    # # thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    # # apply dilation and erosion to remove noise
    # kernel = np.ones((2,2), np.uint8)
    # # kernel1 = np.ones((2,2), np.uint8)
    # thresh = cv2.dilate(thresh, kernel, iterations=1)
    # thresh = cv2.erode(thresh, kernel, iterations=1)

    gray = get_grayscale(image) #Converting Image to a GrayScale Image
    # thresh = thresholding(gray) #Binary Thresholding Apply for the GrayScale Image
    b=remove_noise(gray)
    # openin = opening(b)
    # cannyimg = canny(openin) #Canny Filtering used for the GrayScale Image

    finali = np.hstack((gray,b)) #stacking images side-by-side
    cv2.imwrite("./generated/"+filename,finali)

    # set tesseract OCR to recognize handwritten text
    custom_oem_psm_config = r'-l eng+sin --psm 6'

    # run tesseract OCR on the image
    text = pytesseract.image_to_string(b,config=custom_oem_psm_config)
    # print(text)
    return text



In [4]:
def extract_transaction_info(text):
  # extract transaction information from the text
  transaction_info = {}

  # extract date
  date_pattern = r'(\d{2}[/,-]\d{2}[/,-]\d{4})|(\d{2}[/,-]\d{2}[/,-]\d{2})'

  date_match = re.search(date_pattern, text,re.IGNORECASE)
  if date_match:
      for group in date_match.groups():
          if group:
              transaction_info["Date"] = group
              break

  # extract total amount
  total_pattern = r"(net TOTAL|Grand Total|OTAL|net amount|amount|එකතභව)\D*(\d+[.,]\d+)"
#   total_pattern = r'NET TOTAL (\d+\.\d+)'
  total_match = re.search(total_pattern, text,re.IGNORECASE)
  if total_match.group(1):
      print(total_match.group(1))
  if total_match.group(2):
      print(total_match.group(2))
      transaction_info["Amount"] = total_match.group(2)

      
  return transaction_info


In [59]:
import re

text1 = """
oe “
ye
Porera And Sons
‘162/13, NEW KANDY
OAL WELIVE. A, MALABF
el O11, 41137; Gigen 6AM lose 9PM
Order TaneAway  Customici.
Bill No 2176-337 /82 Cashier: 2G00.PNS
[tem Description
— Oty U/Price = % ~~ Discount Amount
Pol Rotty
1.0060 65.00 8% 0.00 35.00
CB -Dhall VWadei
melee) 90.00 6% 0 OO 903.00
| Fish Chine. Rol!
1.000 160") OM 0.60 166.00
tame June
~~ 2.00. 146.00 O% 0.00 140.00
Net Total: 475.00
Cash : ~ 500.00
Cash Balance : 25.00
1 24503 : 72-176-01 :
pi Copy) ;
' 7
com
"""
text2= """
Se
al eras Y
CARDILLS FOO CITY
Nalkloatbke O2
 O14-2413571
ee/ 10/202 17sb4:29 HADUWANTHT Nos 371
NO ITEM ary PRICE AMOUNT
1 WUNCHEE POTATO CRACKER  -
SCIQGii 1.060 230.00 230,00
Net Total 740,00
CASH 1,000.00
‘Balance 770.00
“aa :
Time End 17254235
eecnenne na THPORTANT NOTICE-~-~-------- :
_ 1 disc ar Ys return
7 days to
Sh . |
a
ee
|
ae
"""
import re

import re



pattern = r'\b(?:Rs|No|Price|Item|Qty|Amount|Discount|Product|Description)\b.*$'

match = re.search(pattern, text1, re.IGNORECASE | re.MULTILINE)

if match:
    print("Matched Line:", match.group())
else:
    print("No match found.")

                                          
                                          


Matched Line: No 2176-337 /82 Cashier: 2G00.PNS


In [118]:
import re

def categorize_lines(text):
    # Convert the text to uppercase
    text = text.upper()

    # Regular expression pattern to detect the first line
    pattern = r'.*?\b(?:RS|PRICE|ITEM|QTY|AMOUNT|DISCOUNT|PRODUCT|DESCRIPTION)\b.*?$'
    
    
    # Find all lines matching the specified pattern
    matches = re.findall(pattern, text, re.IGNORECASE | re.MULTILINE)
    
    
    
    
    # Find the first line matching the specified pattern
    # first_line_match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
    # first_line_match = re.search(pattern, text, re.MULTILINE)
    # 
    # print("first_line_match ",first_line_match )
    # Check if the text matches the pattern
    if matches:
        
        # Store indices of the first two matches
        indices = []
        for match in matches[:2]:
            lines = text.split('\n')
            index = lines.index(match.strip())
            indices.append(index)
        
        matches_length = len(matches)
        print("matches_length",matches_length)
        
        
        if(len(matches)>1):
            print("Indices of the first two matching lines:", indices[0],indices[1])
            if ((indices[0]+1) == indices[1]):
                index=indices[1]
            else:
                index=indices[0]
        else:
            index=indices[0]
        
        
        print("indessss ",index)
        
        
        lines = text.split('\n')
        # index = lines.index(first_line_match.group().strip())  # Get the index of the detected line
        lines_after_detected = lines[index+1:index+3]  # Get the two lines immediately after the detected line
        print("lines_after_detected",lines_after_detected)
        # lines_after_detected ['1 WUNCHEE POTATO CRACKER  -', 'SCIQGII 1.060 230.00 230,00']
        # Check if the first line of the lines_after_detected fits Category One criteria
        first_line = lines_after_detected[0].strip()
        print("first_line",first_line)
        if re.match(r'^[0-9A-Z/.,\'\s;:-][A-Z/.,\'\s;:-]+$', first_line) :
            return "Category One"
        else:
            return "Category Two"
    else:
        return "No match found."

# Example usage:
text2 = """
oe “
ye
Porera And Sons
‘162/13, NEW KANDY
OAL WELIVE. A, MALABF
el O11, 41137; Gigen 6AM lose 9PM
Order TaneAway  Customici.
Bill No 2176-337 /82 Cashier: 2G00.PNS
[tem Description
— Oty U/Price = % ~~ Discount Amount
Pol Rotty
1.0060 65.00 8% 0.00 35.00
CB -Dhall VWadei
melee) 90.00 6% 0 OO 903.00
| Fish Chine. Rol!
1.000 160") OM 0.60 166.00
tame June
~~ 2.00. 146.00 O% 0.00 140.00
Net Total: 475.00
Cash : ~ 500.00
Cash Balance : 25.00
1 24503 : 72-176-01 :
pi Copy) ;
' 7
com
"""
text1="""
Se
al eras Y
CARDILLS FOO CITY
Nalkloatbke O2
 O14-2413571
ee/ 10/202 17sb4:29 HADUWANTHT Nos 371
NO ITEM ary PRICE AMOUNT
1 WUNCHEE POTATO CRACKER  -
SCIQGii 1.060 230.00 230,00
Net Total 740,00
CASH 1,000.00
‘Balance 770.00
“aa :
Time End 17254235
eecnenne na THPORTANT NOTICE-~-~-------- :
_ 1 disc ar Ys return
7 days to
Sh . |
a
ee
|
ae"""

category = categorize_lines(text1)

print("Category:", category)


matches_length 1
indessss  7
lines_after_detected ['1 WUNCHEE POTATO CRACKER  -', 'SCIQGII 1.060 230.00 230,00']
first_line 1 WUNCHEE POTATO CRACKER  -
Category: Category One


In [104]:
# Regular expression pattern to detect the first line
pattern = r'.*?\b(?:RS|PRICE|ITEM|QTY|AMOUNT|DISCOUNT|PRODUCT|DESCRIPTION)\b.*$'

text = """
oe “
ye
Porera And Sons
‘162/13, NEW KANDY
OAL WELIVE. A, MALABF
el O11, 41137; Gigen 6AM lose 9PM
Order TaneAway  Customici.
Bill No 2176-337 /82 Cashier: 2G00.PNS
[tem Description
— Oty U/Price = % ~~ Discount Amount
Pol Rotty
1.0060 65.00 8% 0.00 35.00
CB -Dhall VWadei
melee) 90.00 6% 0 OO 903.00
| Fish Chine. Rol!
1.000 160") OM 0.60 166.00
tame June
~~ 2.00. 146.00 O% 0.00 140.00
Net Total: 475.00
Cash : ~ 500.00
Cash Balance : 25.00
1 24503 : 72-176-01 :
pi Copy) ;
' 7
com
"""
# Find the first line matching the specified pattern
first_line_match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)

# Get the full line that matches the pattern
full_line = first_line_match.group()
lines = text.split('\n')
index = lines.index(first_line_match.group().strip())  # Get the index of the detected line
lines_after_detected = lines[index+1:index+3]  # Get the two lines immediately after the detected line
        
print("index ", index) 
print("full_line ", full_line) 
print("lines_after_detected",lines_after_detected)



index  9
full_line  [tem Description
lines_after_detected ['— Oty U/Price = % ~~ Discount Amount', 'Pol Rotty']


In [106]:
import re

pattern = r'.*?\b(?:RS|PRICE|ITEM|QTY|AMOUNT|DISCOUNT|PRODUCT|DESCRIPTION)\b.*?$'
text = """
oe “
ye
Porera And Sons
‘162/13, NEW KANDY
OAL WELIVE. A, MALABF
el O11, 41137; Gigen 6AM lose 9PM
Order TaneAway  Customici.
Bill No 2176-337 /82 Cashier: 2G00.PNS
[tem Description
— Oty U/Price = % ~~ Discount Amount
Pol Rotty
1.0060 65.00 8% 0.00 35.00
CB -Dhall VWadei
melee) 90.00 6% 0 OO 903.00
| Fish Chine. Rol!
1.000 160") OM 0.60 166.00
tame June
~~ 2.00. 146.00 O% 0.00 140.00
Net Total: 475.00
Cash : ~ 500.00
Cash Balance : 25.00
1 24503 : 72-176-01 :
pi Copy) ;
' 7
com
"""

# Find all lines matching the specified pattern
matches = re.findall(pattern, text, re.IGNORECASE | re.MULTILINE)
print(matches)
# Iterate through the matches and get lines immediately after each match
lines_after_matches = []
for match in matches:
    lines = text.split('\n')
    index = lines.index(match.strip())
    lines_after_match = lines[index + 1: index + 3]
    lines_after_matches.append(lines_after_match)

print("Matching lines:")
for i, match in enumerate(matches):
    print(f"Match {i + 1}: {match}")
    print(f"Lines after match {i + 1}: {lines_after_matches[i]}")


['[tem Description', '— Oty U/Price = % ~~ Discount Amount']
Matching lines:
Match 1: [tem Description
Lines after match 1: ['— Oty U/Price = % ~~ Discount Amount', 'Pol Rotty']
Match 2: — Oty U/Price = % ~~ Discount Amount
Lines after match 2: ['Pol Rotty', '1.0060 65.00 8% 0.00 35.00']


In [107]:
import re

pattern = r'.*?\b(?:RS|PRICE|ITEM|QTY|AMOUNT|DISCOUNT|PRODUCT|DESCRIPTION)\b.*?$'
text = """
oe “
ye
Porera And Sons
‘162/13, NEW KANDY
OAL WELIVE. A, MALABF
el O11, 41137; Gigen 6AM lose 9PM
Order TaneAway  Customici.
Bill No 2176-337 /82 Cashier: 2G00.PNS
[tem Description
— Oty U/Price = % ~~ Discount Amount
Pol Rotty
1.0060 65.00 8% 0.00 35.00
CB -Dhall VWadei
melee) 90.00 6% 0 OO 903.00
| Fish Chine. Rol!
1.000 160") OM 0.60 166.00
tame June
~~ 2.00. 146.00 O% 0.00 140.00
Net Total: 475.00
Cash : ~ 500.00
Cash Balance : 25.00
1 24503 : 72-176-01 :
pi Copy) ;
' 7
com
"""

# Find all lines matching the specified pattern
matches = re.findall(pattern, text, re.IGNORECASE | re.MULTILINE)

# Store indices of the first two matches
indices = []
for match in matches[:2]:
    lines = text.split('\n')
    index = lines.index(match.strip())
    indices.append(index)

print("Indices of the first two matching lines:", indices)


Indices of the first two matching lines: [9, 10]


In [165]:
import re

# Regular expression pattern
pattern = r'^[0-9A-Z/.,\'\s;:-]{4}[A-Z/.,\'\s;:-]+$'

# Example lines to match



lines = [
    "WUNCHEE POTATO CRACKER  - ",
    "6222 JoVEES FACE WASH TEA TREE ML - "
]

for line in lines:
    if re.match(pattern, line, re.IGNORECASE):
        print(f"Matched: {line}")
    else:
        print(f"Not Matched: {line}")


Matched: WUNCHEE POTATO CRACKER  - 
Matched: 6222 JoVEES FACE WASH TEA TREE ML - 


In [162]:
import re

text = """
y N).227, Anagarika Dnarmapale Mw,Nupe,, Matara U'70 108444 / 0112 303 500 / Store Code: SMMR 31-10-2022 12:24:54 €:116296 NP R:2022505 -n item Price Qty Amount | 6117:J0VEES FACE WASH TEA TREE 120ML 1,685.00 
"""

# Regular expression pattern to match words containing "ML" or "KG"
pattern = r'\b\w+(?:ML|KG)\b'

# Remove words containing "ML" or "KG"
result = re.sub(pattern, '', text)

print(result)



y N).227, Anagarika Dnarmapale Mw,Nupe,, Matara U'70 108444 / 0112 303 500 / Store Code: SMMR 31-10-2022 12:24:54 €:116296 NP R:2022505 -n item Price Qty Amount | 6117:J0VEES FACE WASH TEA TREE  1,685.00 


In [174]:
import re

sentence = "| 6117:JoVEES FACE WASH TEA TREE"

# Regular expression pattern to match the first word if it starts with a symbol
pattern = r'^[^\w]+'

# Remove the first word if it starts with a symbol
result = re.sub(pattern, '', sentence).strip()
# Regular expression pattern
pattern = r'^[0-9A-Z/.,\'\s;:-]{4}[A-Z/.,\'\s;:()&-]+$'
# print(result)
lines = [
    "WUNCHEE POTATO CRACKER  - ",
    "YELLOW RICE & CURRY  HICKEN (M) ",
]

for line in lines:
    if re.match(pattern, line, re.IGNORECASE):
        print(f"Matched: {line}")
    else:
        print(f"Not Matched: {line}")


Matched: WUNCHEE POTATO CRACKER  - 
Matched: YELLOW RICE & CURRY  HICKEN (M) 


In [184]:
import re

text = """
Se
al eras Y
CARDILLS FOO CITY
Nalkloatbke O2
 O14-2413571
ee/ 10/202 17sb4:29 HADUWANTHT Nos 371
NO ITEM ary PRICE AMOUNT
1 WUNCHEE POTATO CRACKER  -
SCIQGii 1.060 230.00 230,00
Net Total 740,00
CASH 1,000.00
‘Balance 770.00
“aa :
Time End 17254235
eecnenne na THPORTANT NOTICE-~-~-------- :
_ 1 disc ar Ys return
7 days to
Sh . |
a
ee
"""

# List of words to check for
words_to_check = ["Net Total", "CASH", "Balance"]

# Regular expression pattern to match lines containing specified words
pattern = r'\b(?:' + '|'.join(re.escape(word) for word in words_to_check) + r')\b'

# Split the text into lines and find the indexes of lines matching the pattern
lines = text.splitlines()
matching_line_indexes = [index for index, line in enumerate(lines) if re.search(pattern, line)]

print("Indexes of lines containing specified words:", matching_line_indexes)


Indexes of lines containing specified words: [10, 11, 12]


In [189]:
text = """
Se
al eras Y
CARDILLS FOO CITY
Nalkloatbke O2
 O14-2413571
ee/ 10/202 17sb4:29 HADUWANTHT Nos 371
NO ITEM ary PRICE AMOUNT
1 WUNCHEE POTATO CRACKER  -
SCIQGii 1.060 230.00 230,00
Net Total 740,00
CASH 1,000.00
‘Balance 770.00
“aa :
Time End 17254235
eecnenne na THPORTANT NOTICE-~-~-------- :
_ 1 disc ar Ys return
7 days to
Sh . |
a
ee
"""

# Define a translation table to replace specified characters with spaces
translation_table = str.maketrans({"'": ' ', '"': ' ', '`': ' ', '~': ' ',"_":" ","-":" ",":":" ", '|': ' ', '\\': ' ', '‘': ' ', '“': ' ', '–': ' '})

# Use translate() method to replace characters with spaces
cleaned_text = text.translate(translation_table)

print(cleaned_text)



Se
al eras Y
CARDILLS FOO CITY
Nalkloatbke O2
 O14 2413571
ee/ 10/202 17sb4 29 HADUWANTHT Nos 371
NO ITEM ary PRICE AMOUNT
1 WUNCHEE POTATO CRACKER   
SCIQGii 1.060 230.00 230,00
Net Total 740,00
CASH 1,000.00
 Balance 770.00
 aa  
Time End 17254235
eecnenne na THPORTANT NOTICE              
  1 disc ar Ys return
7 days to
Sh .  
a
ee


In [169]:
sentence = "Hello, 0123456789! This is a test sentence with 0 and other digits."

# Replace all occurrences of "0" with "O"
modified_sentence = sentence.replace("0", "@")

print(modified_sentence)


Hello, @123456789! This is a test sentence with @ and other digits.


In [135]:
import re

pattern = r'^[a-zA-Z0-9]+([a-zA-Z]+)*$'

# Test strings
sentences = [
    "123 ValidWord SecondWord ThirdWord",
    "Invalid@Word SecondWord ThirdWord",
    "ValidWord SecondWord With/Symbols;",
    "Word123 Another-Word Word/With/Symbols;",
    "ValidWord SecondWord ThirdWord-With/Symbols;"
]

for sentence in sentences:
    words = sentence.split()
    if re.match(pattern, sentence) and all(re.match(r'^[A-Z/.,\'\s;:-]+$', word) for word in words[1:]):
        print(f"'{sentence}' matches the pattern.")
    else:
        print(f"'{sentence}' does not match the pattern.")


'123 ValidWord SecondWord ThirdWord' does not match the pattern.
'Invalid@Word SecondWord ThirdWord' does not match the pattern.
'ValidWord SecondWord With/Symbols;' does not match the pattern.
'Word123 Another-Word Word/With/Symbols;' does not match the pattern.
'ValidWord SecondWord ThirdWord-With/Symbols;' does not match the pattern.


In [56]:
folder = "./bills/"

filename='20221022_Food.jpg'
extext = extract_text_from_image(folder+filename,filename)
# print(extext)

category = categorize_lines(extext)

print("Category:", category)

first_line 1 WUNCHEE POTATO CRACKER  -
Category: Category One


In [None]:
extract_transaction_info(extext)

In [6]:
folder = "./bills/"

for filename in os.listdir(folder):
    print(filename)
    extext = extract_text_from_image(folder+filename,filename)
    x=filename.split('.')
    f = open('./generated/'+x[0]+'.txt', "w",encoding="utf-8")
    f.write(extext)
    f.close()


20220821_Food.jpg


FileNotFoundError: [Errno 2] No such file or directory: './generated/20220821_Food.txt'

In [None]:
import matplotlib.pyplot as plt
img = cv2.imread('./bills/20220821_Food.jpg')
# img = get_grayscale(img)
plt.imshow(img)

In [None]:
imgbox=pytesseract.image_to_boxes(img)
#Parameters to find the Dimention of the Images
imgH, imgW,_ = img.shape

#Iterative Process for Create boxes in the Image
for boxes in imgbox.splitlines():
     boxes = boxes.split(' ')
     x,y,w, h = int(boxes[1]),int(boxes[2]),int(boxes[3]),int(boxes[4])
     cv2.rectangle(img, (x,imgH-y) , (w,imgH-h), (0,0,255),3)
plt.imshow(img) 


#### create a python model for extract transaction information from a bank sms among others

- First, you will need to obtain a dataset of bank SMS messages that contain transaction information. You can either create this dataset yourself by manually collecting and annotating SMS messages, or you can try to find a pre-existing dataset online.

- Next, you will need to preprocess the dataset by cleaning and normalizing the SMS messages. This may involve removing special characters, lowercasing all the text, and splitting the messages into individual words.

- Once you have a cleaned and normalized dataset, you can start building your model. One approach you could take is to use a sequence labeling model, such as a Conditional Random Field (CRF), to identify and classify the various pieces of transaction information within the SMS messages.

- To train your model, you will need to split your dataset into a training set and a test set. You can then use the training set to fit your model and the test set to evaluate its performance.

- Finally, you can use your trained model to extract transaction information from new, unseen bank SMS messages. You can do this by passing the SMS messages through your model and using the predicted labels to extract the relevant information.

In [None]:



import nltk

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    return tokens

sms_messages = ["ATM Withdrawal Rs 30000.00 From A/C No XXXXXXXXXX400. Balance available Rs 5963.81 - Thank you for banking with BOC","ATM Withdrawal Rs 4000.00 From A/C No XXXXXXXXXX400. Balance available Rs 16969.15 - Thank you for banking with BOC","HNB SMS ALERT: PURCHASE, Debit account:0717***4696,Location:MY CHEMIST (PVT) LTD, LK,Amount(Approx.):1399.73 LKR,Av.Bal:101534.94 LKR,Date:28.12.21,Time:13:21, Hot Line:0112462462"]

sms_tokens = [tokenize(sms) for sms in sms_messages]

print(sms_tokens)