In [16]:
import cv2
import numpy as np
import pytesseract
import re
import os
# Location of the Tesseract executable used by pytesseract. The TESSERACT_CMD
# environment variable (a name chosen for this notebook) overrides the path so the
# notebook also runs on machines without the default Windows install location.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD",
    r'C:\Program Files\Tesseract-OCR\tesseract.exe',
)


In [17]:
def get_grayscale(image):
    """Convert a BGR image to grayscale with ``cv2.cvtColor``."""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return gray

def remove_noise(image):
    """Smooth the image with a median blur (aperture size 5) to suppress noise."""
    aperture_size = 5
    return cv2.medianBlur(image, aperture_size)
 
def thresholding(image):
    """Binarize the image using binary thresholding combined with the Otsu flag.

    ``cv2.threshold`` returns a pair; only its second element (the thresholded
    image) is returned to the caller.
    """
    _, binary = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary

def dilate(image):
    """Dilate the image once with a 5x5 all-ones ``uint8`` kernel."""
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    return cv2.dilate(image, structuring_element, iterations=1)
    
def erode(image):
    """Erode the image once with a 5x5 all-ones ``uint8`` kernel."""
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    return cv2.erode(image, structuring_element, iterations=1)

def opening(image):
    """Apply a morphological opening (erosion followed by dilation) with a 5x5 kernel."""
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    opened = cv2.morphologyEx(image, cv2.MORPH_OPEN, structuring_element)
    return opened

def canny(image):
    """Run Canny edge detection with thresholds 100 and 200."""
    lower_threshold, upper_threshold = 100, 200
    return cv2.Canny(image, lower_threshold, upper_threshold)

def deskew(image):
    """Rotate the image about its centre to undo the skew of its non-zero pixels.

    The skew angle is taken from the minimum-area rectangle around every pixel
    whose value is greater than zero. The output has the same width and height
    as the input; borders are filled by replicating edge pixels.
    """
    nonzero_points = np.column_stack(np.where(image > 0))
    rect_angle = cv2.minAreaRect(nonzero_points)[-1]

    # NOTE(review): the "< -45" fold assumes minAreaRect reports negative angles;
    # confirm the angle range of the installed OpenCV version.
    rotation_angle = -(90 + rect_angle) if rect_angle < -45 else -rect_angle

    height, width = image.shape[:2]
    pivot = (width // 2, height // 2)
    rotation_matrix = cv2.getRotationMatrix2D(pivot, rotation_angle, 1.0)
    return cv2.warpAffine(
        image,
        rotation_matrix,
        (width, height),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )

#template matching
#def match_template(image, template):
  #return cv2.matchTemplate(image, template, cv2.TM_CCOEFF_NORMED) 

In [18]:

def extract_text_from_image(image_path,filename):
    """Run Tesseract OCR on a grayscale, median-blurred version of an image.

    As a side effect, a side-by-side preview (grayscale | blurred) is written to
    ``./generated/<filename>``.

    Args:
        image_path: Path of the image to load with ``cv2.imread``.
        filename: File name used for the preview written under ``./generated/``.

    Returns:
        The text returned by ``pytesseract.image_to_string`` for the blurred image.

    Raises:
        FileNotFoundError: If ``cv2.imread`` could not load ``image_path``.
    """
    image = cv2.imread(image_path)
    # cv2.imread reports failure by returning None instead of raising; fail here
    # with a clear message rather than with an opaque error inside cvtColor.
    if image is None:
        raise FileNotFoundError("Could not read image: {}".format(image_path))

    gray = get_grayscale(image)
    denoised = remove_noise(gray)

    # cv2.imwrite does not create missing directories and would silently skip the
    # preview, so make sure the output folder exists first.
    os.makedirs("./generated/", exist_ok=True)
    preview = np.hstack((gray, denoised))  # grayscale and blurred images side by side
    cv2.imwrite("./generated/"+filename, preview)

    # Tesseract options: English + Sinhala language data, page segmentation mode 6.
    custom_oem_psm_config = r'-l eng+sin --psm 6'

    text = pytesseract.image_to_string(denoised, config=custom_oem_psm_config)
    return text



In [19]:
import re

def categorize_lines_old_version1(text):
    """Classify a receipt layout from the line that follows its column header.

    The text is upper-cased and the first line containing one of the header
    keywords (RS, PRICE, ITEM, QTY, ...) as a whole word is located. The line
    right after it decides the category.

    Args:
        text: OCR text of the receipt, lines separated by ``'\\n'``.

    Returns:
        ``"Category One"`` when the line after the header consists only of
        letters/punctuation (optionally starting with one digit),
        ``"Category Two"`` otherwise (including when the header is the last
        line), or ``"No match found."`` when no header line exists.
    """
    header_pattern = r'.*?\b(?:RS|PRICE|ITEM|QTY|AMOUNT|DISCOUNT|PRODUCT|DESCRIPTION)\b.*$'
    lines = text.upper().split('\n')

    # Search line by line so the index is exact even when the header line has
    # leading/trailing whitespace (looking up the stripped match text in the
    # line list raised ValueError in that case).
    header_index = next(
        (
            position
            for position, line in enumerate(lines)
            if re.search(header_pattern, line, re.IGNORECASE)
        ),
        None,
    )
    if header_index is None:
        return "No match found."

    # The header may be the last line; treat the missing follow-up line as empty.
    following = lines[header_index + 1:header_index + 2]
    first_line = following[0].strip() if following else ""

    if re.match(r'^[0-9A-Z/.,\'\s;:-][A-Z/.,\'\s;:-]+$', first_line):
        return "Category One"
    return "Category Two"

In [20]:
def categorize_lines(text):
    """Classify a receipt layout from the line that follows its column header.

    The text is upper-cased and up to two lines containing a header keyword
    (RS, PRICE, ITEM, QTY, ...) as a whole word are located. When those two
    lines are adjacent the second one is treated as the header, otherwise the
    first one is. The line right after the header decides the category.

    Args:
        text: OCR text of the receipt, lines separated by ``'\\n'``.

    Returns:
        ``1`` when the cleaned line after the header is at least seven
        characters long and contains no digits after its first six characters,
        ``2`` otherwise (including when the header is the last line), or the
        string ``"No match found."`` when no header line exists.
    """
    header_pattern = r'.*?\b(?:RS|PRICE|ITEM|QTY|AMOUNT|DISCOUNT|PRODUCT|DESCRIPTION)\b.*?$'
    lines = text.upper().split('\n')

    # Collect line positions directly instead of looking the matched text up in
    # the line list: that lookup raised ValueError for header lines with
    # surrounding whitespace and returned the same index for duplicate lines.
    header_indices = [
        position
        for position, line in enumerate(lines)
        if re.search(header_pattern, line, re.IGNORECASE)
    ][:2]
    if not header_indices:
        return "No match found."

    index = header_indices[0]
    if len(header_indices) > 1 and header_indices[1] == index + 1:
        # Two-line header: the item rows start after the second header line.
        index = header_indices[1]

    # The header may be the last line; treat the missing follow-up line as empty.
    following = lines[index + 1:index + 2]
    first_line = following[0].strip() if following else ""

    # Drop leading non-word characters (OCR artefacts such as '|' or quotes).
    first_line = re.sub(r'^[^\w]+', '', first_line).strip()
    # Digit zero is replaced by the letter O before the letters-only check.
    first_line = first_line.replace("0", "O")
    # Remove the cent sign wherever it appears.
    first_line = re.sub(r'[¢]', '', first_line)

    if re.match(r'^[0-9A-Z/.,\'\s;:-]{6}[A-Z/.,\'\s;:&()-]+$', first_line):
        return 1
    return 2

In [21]:
# Directory that holds the scanned bill images.
folder = "./bills/"

# Sample bills that have been tried with this cell:
#   20221022_Food.jpg, 20220918_Food.jpg, 20221031_Grocery.jpg, 20221209_Food_2.jpg,
#   20220821_Food.jpg, 20221030_Food.jpg,
#   20221110_Grocery.jpg (not for showing; used only to get an idea)
filename = '20221110_Grocery.jpg'

# OCR the selected bill; categorisation and range detection are not run here.
extext = extract_text_from_image(image_path=folder + filename, filename=filename)

In [22]:
def find_range(text):
    """Locate the header line and the totals line that bracket the item rows.

    The text is upper-cased and split on ``'\\n'``. The header ("top") is chosen
    from the first two lines containing a header keyword (RS, PRICE, ITEM, ...):
    the second one when the two are adjacent, otherwise the first; ``0`` when no
    line matches. The "bottom" is the first line after the header that contains
    one of the totals keywords (TOTAL, CASH, NET AMOUNT, ...). When no header
    was found the search starts at the first line; when no totals line is
    found, the number of lines is used so that ``lines[top + 1:bottom]`` spans
    the rest of the text.

    The chosen indices and lines are printed as a progress trace.

    Args:
        text: OCR text of the receipt, lines separated by ``'\\n'``.

    Returns:
        A dict ``{"top": <int>, "bottom": <int>}`` of line indices.
    """
    top_pattern = r'.*?\b(?:RS|PRICE|ITEM|QTY|AMOUNT|DISCOUNT|PRODUCT|DESCRIPTION)\b.*?$'

    words_to_check = ["NET TOTAL", "GRAND TOTAL", "TOTAL", "NET AMOUNT","CASH","GROSS AMOUNT"]
    pattern_for_bottom = r'\b(?:' + '|'.join(re.escape(word) for word in words_to_check) + r')\b'

    # One consistent split for both searches: str.splitlines() also breaks on
    # characters such as form feed, which would shift indices relative to
    # callers that split on '\n'.
    lines = text.upper().split('\n')

    # Enumerate lines instead of looking up stripped match text, which raised
    # ValueError for header lines with surrounding whitespace.
    header_indices = [
        position
        for position, line in enumerate(lines)
        if re.search(top_pattern, line, re.IGNORECASE)
    ][:2]

    Top_index = 0
    search_from = 0
    if header_indices:
        Top_index = header_indices[0]
        if len(header_indices) > 1 and header_indices[1] == Top_index + 1:
            # Two-line header: item rows start after the second header line.
            Top_index = header_indices[1]
        # A totals keyword above the header (e.g. "CASH BILL") must not end the range.
        search_from = Top_index + 1

    Bottom_index = next(
        (
            position
            for position, line in enumerate(lines)
            if position >= search_from and re.search(pattern_for_bottom, line)
        ),
        len(lines),
    )

    print("top index ",Top_index)
    print(lines[Top_index])
    print("Bottom_index ",Bottom_index)
    if Bottom_index < len(lines):
        print(lines[Bottom_index])
    return {"top": Top_index, "bottom": Bottom_index}
    


In [23]:
def extract_amount(top,bottom,type,text):
    """Parse the item rows between the header and totals lines into name -> amount.

    The text is upper-cased, a fixed set of punctuation characters is replaced
    by spaces, and the lines strictly between ``top`` and ``bottom`` are parsed.
    The selected lines and the resulting dict are printed as a trace.

    Args:
        top: Index of the header line (exclusive start of the item rows).
        bottom: Index of the totals line (exclusive end of the item rows).
        type: Layout category. ``1`` means rows come in pairs (name line
            followed by an amounts line); any other value means one row per
            item with the name before the first digit. The name shadows the
            builtin but is kept for caller compatibility.
        text: OCR text of the receipt, lines separated by ``'\\n'``.

    Returns:
        Dict mapping item name to the last number of its row as ``float``.
        Rows whose amount cannot be parsed are skipped.
    """
    # Characters that OCR tends to sprinkle around names/amounts; each becomes a space.
    separators = "‘()'\"`~_-:|\\“–"
    translation_table = str.maketrans({character: " " for character in separators})

    lines = text.upper().translate(translation_table).split('\n')
    content = lines[(top+1):bottom]
    print(content)

    output_dict = {}
    if type==1:
        # Pair even-positioned name lines with the amounts line that follows;
        # zip drops a trailing unpaired line.
        # TODO: leading item numbers are still part of the key.
        for name_line, amount_line in zip(content[0::2], content[1::2]):
            fields = amount_line.replace(',', '').split()
            try:
                value = float(fields[-1])
            except (IndexError, ValueError):
                # Empty or non-numeric amounts line (OCR noise): skip this pair.
                continue
            output_dict[name_line.lstrip()] = value
    else:
        # Comma-grouped thousands are matched as one number so "2,060.00" is
        # read as 2060.0 instead of 60.0; otherwise a plain decimal or integer.
        number_pattern = r'\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d+\.\d+|\d+'
        for item in content:
            # Name = characters up to (not including) the first digit.
            match = re.search(r'(.+?)(?=\s*\d)', item)
            if match:
                name = match.group(1).strip()
                numbers = re.findall(number_pattern, item)
                output_dict[name] = float(numbers[-1].replace(',', ''))

    print(output_dict)
    return output_dict


In [24]:
# Directory that holds the scanned bill images.
folder = "./bills/"

# Sample bills that have been tried with this cell:
#   20221022_Food.jpg, 20220918_Food.jpg, 20221031_Grocery.jpg, 20221209_Food_2.jpg,
#   20220821_Food.jpg, 20221030_Food.jpg,
#   20221110_Grocery.jpg (not for showing; used only to get an idea)
filename='20220821_Food.jpg'
extext = extract_text_from_image(folder+filename,filename)
print(extext)
print("------------------")

extext = extext.upper()

category = categorize_lines(extext)

# Stored as item_range rather than `range`: assigning to `range` at cell level
# replaces the builtin range() for every cell and function run afterwards.
item_range = find_range(extext)
top_value = item_range["top"]
bottom_value = item_range["bottom"]

extract_amount(top_value,bottom_value,category,extext)

pi le
“Wo : 10 ’ Ns
“KOT Ref. © 33,36,37
Served By: SMART
d
‘Oi-Jun-2022 were
Bill @ 33 ee
DESCRIPTION arr a.
SGiro-nscs Korru — 2 2000.00
CHIC CHEESE KOTTU L 1 1200.00
SPRITE (400M) ‘ 480.00
EGE 400%. : 120
BISCUIT PUDDING 2 400.00
CREAM CARAMAL 4 1600.00
eee CASH 6380 00
Total Qty : 13.00 2
Sub Total : $800.06
Ser. Charge : $80.00 ©
Bill Amount: 6380.00
Paid Amount 6380.00 :
Bal. Amount : 0.00
Opening hours 10 am to 12
Open 7 days vy
you
we waa Tee

------------------
top index  7
DESCRIPTION ARR A.
Bottom_index  14
EEE CASH 6380 00
['SGIRO NSCS KORRU — 2 2000.00', 'CHIC CHEESE KOTTU L 1 1200.00', 'SPRITE  400M    480.00', 'EGE 400%.   120', 'BISCUIT PUDDING 2 400.00', 'CREAM CARAMAL 4 1600.00']
{'SGIRO NSCS KORRU —': 2000.0, 'CHIC CHEESE KOTTU L': 1200.0, 'SPRITE': 480.0, 'EGE': 120.0, 'BISCUIT PUDDING': 400.0, 'CREAM CARAMAL': 1600.0}


In [28]:
# Scratch note: a multi-line string holding two pasted samples -- a list of
# alternating item-name / amounts lines and a name -> price dict. It is only
# printed; nothing below parses it.
x="""
['  6117 J0VEES FACE WASH TEA TREE 120ML', 
'1,685.00 1.0 1,685.00', 
'3 951003 CHEESE AND ONION BREAD', 
'375.00 1.0 375.900'
]
{'SGIRO NSCS KORRU —': 2000.0, 
'CHIC CHEESE KOTTU L': 1200.0, 
'SPRITE': 480.0, 'EGE': 120.0, 
'BISCUIT PUDDING': 400.0, 
'CREAM CARAMAL': 1600.0}
"""
print(x)


['  6117 J0VEES FACE WASH TEA TREE 120ML', 
'1,685.00 1.0 1,685.00', 
'3 951003 CHEESE AND ONION BREAD', 
'375.00 1.0 375.900'
]
{'SGIRO NSCS KORRU —': 2000.0, 
'CHIC CHEESE KOTTU L': 1200.0, 
'SPRITE': 480.0, 'EGE': 120.0, 
'BISCUIT PUDDING': 400.0, 
'CREAM CARAMAL': 1600.0}


In [26]:
import re

# Experiment: parse receipts where name and price share one line.
input_list = [
    'y',
    'N).227, Anagarika Dnarmapale Mw,Nupe,, Matara',
    'U70 108444  0112 303 500 / Store Code: SMMR',
    '31-10-2022 12:24:54 €:116296 NP R:2022505',
    '-n item Price Qty Amount',
    '| 6117:J0VEES FACE WASH TEA TREE 120ML',
    '1,685.00 1.0 1,685.00',
    '3 951003:CHEESE AND ONION BREAD',
    '375.00 1.0 375.900',
    'Gross Amount 2,060.00',
    'Ne: Amount 2,060.00',
]

# Name: shortest non-empty prefix that is followed by (optional spaces and) a digit.
name_pattern = re.compile(r'(.+?)(?=\s*\d)')
# Price: last decimal or integer found on the line.
number_pattern = re.compile(r'\d+\.\d+|\d+')

output_dict = {}
for item in input_list:
    match = name_pattern.search(item)
    if match is None:
        # Lines without any digit carry no price and are ignored.
        continue
    name = match.group(1).strip()
    price = float(number_pattern.findall(item)[-1])
    output_dict[name] = price

print(output_dict)


{'N).': 227.0, 'U': 500.0, '3': 375.9, '|': 120.0, '1,': 685.0, 'Gross Amount': 60.0, 'Ne: Amount': 60.0}


In [27]:
# Experiment: parse receipts where each item spans two lines
# (name line followed by an amounts line).
input_list = [
    '  6117 J0VEES FACE WASH TEA TREE 120ML',
    '1,200.00 1.0 1,685.00',
    '3 951003 CHEESE AND ONION BREAD',
    '200.0 1.0 375.900',
    'aaaaaaaaaaaaa bbbbbbbbb yyyyyyyy',
    '200.0 1.0 365.900',
    'sdgsd gfdgdf bdd'
]
print(input_list)

output_dict = {}
# Even positions hold names, odd positions hold amounts; zip stops at the last
# complete pair, so a trailing unpaired line is ignored.
for name_line, amounts_line in zip(input_list[0::2], input_list[1::2]):
    # Key: the name line without leading spaces.
    key = name_line.lstrip()
    # Value: last whitespace-separated field of the amounts line, commas removed.
    value = float(amounts_line.replace(',', '').split()[-1])
    output_dict[key] = value

print(output_dict)

i = 7
# Number of list entries that belong to complete pairs.
print(((len(input_list)//2)*2))

['  6117 J0VEES FACE WASH TEA TREE 120ML', '1,200.00 1.0 1,685.00', '3 951003 CHEESE AND ONION BREAD', '200.0 1.0 375.900', 'aaaaaaaaaaaaa bbbbbbbbb yyyyyyyy', '200.0 1.0 365.900', 'sdgsd gfdgdf bdd']
{'6117 J0VEES FACE WASH TEA TREE 120ML': 1685.0, '3 951003 CHEESE AND ONION BREAD': 375.9, 'aaaaaaaaaaaaa bbbbbbbbb yyyyyyyy': 365.9}
6
