# Using Pytesseract OCR For Bounding Box Calculation

In [3]:
import pytesseract
from bs4 import BeautifulSoup
from PIL import Image
import os


In [4]:
import pandas as pd

train_df = pd.read_csv("/kaggle/input/csv-file/output_data_train.csv")
train_df.head()

Unnamed: 0,invoice_number,issue_date,total,table,title
0,,21/08/2018,76.48,Descriplion of Goods HS Tarift Packagcs quanti...,Commercial-invoice-example.jpg
1,,,,,5th.jpg
2,1009624,04/01/2020,1431.27,"Summary of Charges\nPrevious Balance 6,632.88\...",sample-voip-invoice-page-1.jpg
3,20-4500,12/01/2018,52844.8,descRiPTCN oTy' HR PRICE amm\nxtchen gink 'hou...,Sample-Invoice-printable.png
4,4908291912124017,,5599.0,ITEM DESCRIPTION Transaction SALE DiscoU] NET ...,EM4uiZWVAAAIDXW.jpg


In [5]:
train_df.isna().sum()

invoice_number    13
issue_date         4
total              5
table              4
title              0
dtype: int64

# IoU (Intersection Over Union) Calculation

In [4]:
def calculate_iou(box1, box2):
    
    # Calculate the coordinates of the intersection rectangle
    x1_inter = max(box1[0], box2[0])
    y1_inter = max(box1[1], box2[1])
    x2_inter = min(box1[2], box2[2])
    y2_inter = min(box1[3], box2[3])

    # Calculate the area of the intersection rectangle
    area_inter = max(0, x2_inter - x1_inter + 1) * max(0, y2_inter - y1_inter + 1)

    # Calculate the area of each bounding box
    area_box1 = (box1[2] - box1[0] + 1) * (box1[3] - box1[1] + 1)
    area_box2 = (box2[2] - box2[0] + 1) * (box2[3] - box2[1] + 1)

    # Calculate the area of the union of the two bounding boxes
    area_union = area_box1 + area_box2 - area_inter

    # Calculate IoU
    iou = area_inter / area_union if area_union > 0 else 0.0

    return iou

# Example usage:
box1 = [76, 344, 857, 449]
box2 = [67, 337, 831, 391]

iou_score = calculate_iou(box1, box2)
print("IoU Score:", iou_score)


IoU Score: 0.4092062382300206


In [7]:
import pandas as pd

# Bounding Box Calculation on Train Dataset

In [5]:
def bounding_box(x):
    
    folder_path = f'/kaggle/input/llm-competition/llm-competition/LLM Competition/Train Files/{x}'
    img = Image.open(folder_path)

    # Perform OCR and get hOCR output
    hocr_output = pytesseract.image_to_pdf_or_hocr(img, extension='hocr')

    # Parse the hOCR content using BeautifulSoup
    soup = BeautifulSoup(hocr_output, 'html.parser')

    # Extract bounding box information from the parsed content
    bounding_boxes = []
    for span in soup.find_all('span', class_='ocrx_word'):
        # Extract text
        extracted_text = span.get_text(strip=True)

        # Extract bounding box coordinates
        bbox = span['title'].split(';')[0].split(' ')[1:]
        bbox = [int(coord) for coord in bbox]

        bounding_boxes.append({
            'text': extracted_text,
            'bounding_box': bbox
        })
    return bounding_boxes

# # Print the results
# for entry in bounding_boxes:
#     print('Text:', entry['text'])
#     print('Bounding Box:', entry['bounding_box'])
#     print()

    

In [13]:
train_result_df = train_df['title'].apply(lambda x: bounding_box(x))

In [14]:
# count = 0
# for box in result_df[0]:
#     count += 1
#     print(count)
#     print(box['bounding_box'])

# target_list = ['']

target_list = ['Contact','Commercial','Here']
for bounding_box in train_result_df[0]:  #train_result_df[0] is a list of dictionary
    box = bounding_box['bounding_box']
    text = bounding_box['text']
    for target in target_list:
        if target == text:
            print(f"Text: {text} box:{box}")
    
        


Text: Commercial box:[398, 46, 508, 59]
Text: Contact box:[61, 90, 105, 99]
Text: Contact box:[61, 278, 105, 286]
Text: Contact box:[631, 448, 675, 457]


# Retrieving the bounding box of the training set

In [70]:
def retrieve_bounding_box(x, y, z, bounding_boxes_lists):
    text_extraction_row = []
    bounding_box_extraction_row = []
    # x, y, and z are pandas Series representing columns of DataFrame
    for index, row in zip(x.index, zip(x, y, z)):  #looping through each row
        bounding_boxes = bounding_boxes_lists[index]  #updating the bounding_boxes_lists on next loop
#         print(bounding_boxes)
        target_list = [row[0], row[1], row[2]]  
    
        for bounding_box in bounding_boxes:  #result_df[0] is a list of dictionary
            box = bounding_box['bounding_box']
            text = bounding_box['text']
            for target in target_list:
                if target == text:
                    
                    text_extraction_row.append(text)
                    bounding_box_extraction_row.append(box) 
                    
                
    return text_extraction_row, bounding_box_extraction_row


result = retrieve_bounding_box(df['invoice_number'],df['total'], df['issue_date'], result_df)

if result:
    text, bounding_box = result
    text = zip(text,bounding_box)
    print(tuple(text))
else:
    print("No matching bounding box found.")


()


# Bounding Box prediction process of test dataset

In [6]:
test_df = pd.read_csv("/kaggle/input/output-data-csv/output_data_test.csv")

In [7]:
test_df.head()

Unnamed: 0,invoice_number,issue_date,total,table,title
0,22011,01/08/2020,436.45,Date Services Hours Rates Amount\n08/01/2020 A...,mceclip2.png
1,4631508,13/05/2013,1445.99,Line No_ Line No_ Description Shipped (USD) (U...,p1.jpg
2,,05/02/2022,,,Commercial-Invoice-Template-19.jpg
3,To00000001,10/10/2020,712000.0,ITEM NAME QTy PRICE{ DISCOUNT/ GST AMOUNT UNIT...,image-414 (1).png
4,,19/11/2020,7000.0,Particulars Quantity Rate Amount\nRepairs and ...,service-invoice.jpg


In [8]:
import numpy as np

test_df.insert(loc = 1, column = 'invoice_number_bbox' , value = np.nan)
test_df.insert(loc = 3, column = 'issue_date_bbox' , value = np.nan)
test_df.insert(loc = 5, column = 'total_bbox' , value = np.nan)

In [9]:
test_df.head()
# test_add_bbox_df.drop('invoice_number_bbox', axis = 'columns', inplace = True)

Unnamed: 0,invoice_number,invoice_number_bbox,issue_date,issue_date_bbox,total,total_bbox,table,title
0,22011,,01/08/2020,,436.45,,Date Services Hours Rates Amount\n08/01/2020 A...,mceclip2.png
1,4631508,,13/05/2013,,1445.99,,Line No_ Line No_ Description Shipped (USD) (U...,p1.jpg
2,,,05/02/2022,,,,,Commercial-Invoice-Template-19.jpg
3,To00000001,,10/10/2020,,712000.0,,ITEM NAME QTy PRICE{ DISCOUNT/ GST AMOUNT UNIT...,image-414 (1).png
4,,,19/11/2020,,7000.0,,Particulars Quantity Rate Amount\nRepairs and ...,service-invoice.jpg


In [62]:
# df.drop??

In [4]:
def bounding_box(x):
    
    folder_path = f'/kaggle/input/llm-competition/llm-competition/LLM Competition/Test Files/{x}'
    img = Image.open(folder_path)

    # Perform OCR and get hOCR output
    hocr_output = pytesseract.image_to_pdf_or_hocr(img, extension='hocr')

    # Parse the hOCR content using BeautifulSoup
    soup = BeautifulSoup(hocr_output, 'html.parser')

    # Extract bounding box information from the parsed content
    bounding_boxes = []
    for span in soup.find_all('span', class_='ocrx_word'):
        # Extract text
        extracted_text = span.get_text(strip=True)

        # Extract bounding box coordinates
        bbox = span['title'].split(';')[0].split(' ')[1:]
        bbox = [int(coord) for coord in bbox]

        bounding_boxes.append({
            'text': extracted_text,
            'bounding_box': bbox
        })
    return bounding_boxes

# # Print the results
# for entry in bounding_boxes:
#     print('Text:', entry['text'])
#     print('Bounding Box:', entry['bounding_box'])
#     print()

    

In [10]:
test_result_df = test_df['title'].apply(lambda x: bounding_box(x))

In [11]:
test_result_df[0][10:15]

[{'text': '', 'bounding_box': [22, 527, 742, 558]},
 {'text': '', 'bounding_box': [491, 615, 630, 633]},
 {'text': '‘Smith', 'bounding_box': [24, 31, 61, 41]},
 {'text': '&', 'bounding_box': [66, 31, 75, 41]},
 {'text': 'Smith,', 'bounding_box': [81, 31, 122, 43]}]

In [18]:
def retrieve_bounding_box(df, bounding_boxes_lists):
    x = df['invoice_number']
    y = df['issue_date']
    z = df['total']
#     text_extraction_row = []
#     bounding_box_extraction_row = []
    # x, y, and z are pandas Series representing columns of DataFrame
    for index, row in zip(x.index, zip(x, y, z)):  #looping through each row
        bounding_boxes = bounding_boxes_lists[index]  #updating the bounding_boxes_lists on next loop
#         print(bounding_boxes)
#         target_list = [row[0], row[1], row[2]]  
    
        for bounding_box in bounding_boxes:  #result_df[0] is a list of dictionary
            box = bounding_box['bounding_box']
            text = bounding_box['text']
#             print(text)
            if (text == row[0]):
                df.at[index,'invoice_number_bbox'] = str(box)
            elif (text == row[1]):
                df.at[index,'issue_date_bbox'] = str(box)
            elif (text == row[2]):
                df.at[index,'total_bbox'] = str(box)
                
    return df

                
result = retrieve_bounding_box(test_df, test_result_df)

# if result:
#     df = result
#     print(df.head())

  df.at[index,'total_bbox'] = str(box)


In [19]:
test_df.head()

Unnamed: 0,invoice_number,invoice_number_bbox,issue_date,issue_date_bbox,total,total_bbox,table,title
0,22011,"[688, 191, 739, 206]",01/08/2020,,436.45,,Date Services Hours Rates Amount\n08/01/2020 A...,mceclip2.png
1,4631508,"[785, 162, 846, 194]",13/05/2013,,1445.99,,Line No_ Line No_ Description Shipped (USD) (U...,p1.jpg
2,,,05/02/2022,,,,,Commercial-Invoice-Template-19.jpg
3,To00000001,"[323, 120, 408, 148]",10/10/2020,,712000.0,,ITEM NAME QTy PRICE{ DISCOUNT/ GST AMOUNT UNIT...,image-414 (1).png
4,,,19/11/2020,,7000.0,,Particulars Quantity Rate Amount\nRepairs and ...,service-invoice.jpg


In [21]:
test_df

Unnamed: 0,invoice_number,invoice_number_bbox,issue_date,issue_date_bbox,total,total_bbox,table,title
0,22011,"[688, 191, 739, 206]",01/08/2020,,436.45,,Date Services Hours Rates Amount\n08/01/2020 A...,mceclip2.png
1,4631508,"[785, 162, 846, 194]",13/05/2013,,1445.99,,Line No_ Line No_ Description Shipped (USD) (U...,p1.jpg
2,,,05/02/2022,,,,,Commercial-Invoice-Template-19.jpg
3,To00000001,"[323, 120, 408, 148]",10/10/2020,,712000.0,,ITEM NAME QTy PRICE{ DISCOUNT/ GST AMOUNT UNIT...,image-414 (1).png
4,,,19/11/2020,,7000.0,,Particulars Quantity Rate Amount\nRepairs and ...,service-invoice.jpg
5,C1-10012014,,10/07/2014,,80000.0,,DESCRIPTION QUANTITY TOTAL PRICE\nDOORS AND WI...,commercial-invoice-sample.png
6,BS 10009,,11/6/2013,"[686, 239, 745, 249]",120.0,,Product Price Quantity Line Total\nFlying Ninj...,07_Invoice_Example.png
7,,,14/09/2017,,918.0,"[2200, 3010, 2310, 3041]",Unit Item Price VAT Total\nSM-NO5OFZKABTU Sams...,Samsung Invoice.jpg
8,MAX1234,,19/11/2020,,93000.0,,Description Goods Quantity Rate Amount\nAsus 1...,comercial-invoice.jpg
9,,,07/08/2012,,39.65,,,e1a0ba8226a03551145610bd34968cc5.jpg
