In [3]:
import langchain

In [4]:
# Echo the installed LangChain version so the environment is recorded in the output.
langchain.__version__

'1.1.3'

In [5]:
import fitz
import pytesseract
from PIL import Image
from io import BytesIO
import os

In [6]:
from dotenv import load_dotenv

In [7]:
# Load environment variables via python-dotenv; the echoed value is the call's return.
# NOTE(review): presumably this supplies the Hugging Face token used by the
# endpoint created later in the notebook — confirm.
load_dotenv()

True

In [9]:
def tesseract_extract_from_pdf(pdf_path: str) -> str:
    """OCR every page of a PDF with Tesseract and return the combined text.

    Each page is rendered to a 300 DPI pixmap, converted to a PIL image and
    passed to ``pytesseract.image_to_string``. The text of each page is
    followed by a blank-line separator, and one progress line per page is
    printed with the stripped character count.

    Args:
        pdf_path: Location of the PDF, passed straight to ``fitz.open``.

    Returns:
        The OCR text of all pages in page order; an empty string when the
        document has no pages.

    Raises:
        Any exception raised by ``fitz``, ``PIL`` or ``pytesseract`` is
        propagated unchanged; the document is closed before it propagates.
    """
    page_texts = []
    doc = fitz.open(pdf_path)

    # try/finally guarantees the document handle is released even when
    # rendering or OCR fails part-way through the file.
    try:
        for i, page in enumerate(doc):
            pix = page.get_pixmap(dpi=300)
            img = Image.open(BytesIO(pix.tobytes("ppm")))
            page_text = pytesseract.image_to_string(img)
            page_texts.append(page_text + "\n\n")
            print(f"Page {i+1}: {len(page_text.strip())} characters")
    finally:
        doc.close()

    # Join once instead of growing a string inside the loop.
    return "".join(page_texts)

In [None]:
# The input PDF can be overridden with the OCR_PDF_PATH environment variable so
# the notebook is not tied to one machine; the default is the original location.
pdf_path = os.environ.get(
    "OCR_PDF_PATH",
    r"C:\Users\rajak\Desktop\OCR\OCR\Sale Deed.pdf",
)

# Raw, uncorrected Tesseract output for the whole document.
extracted_texts = tesseract_extract_from_pdf(pdf_path)

Page 1: 2353 characters
Page 2: 2396 characters


In [19]:
# Echo the raw OCR string to eyeball recognition quality before correction.
extracted_texts

'A\nSALE DEED ——_ | wn\nThis DEED OF ABSOLUTE SALE executed at on this the day of , 201X\n\nby s/o residing at\nB\n\nhereinafter called the VENDOR of the one part which expression shall include his executors,\nadministrators, legal representatives, successors etc.\n\nos\n\nw/o residing at\n\nTO AND IN FAVOUR OF\n\nherein after called the PURCHASER of the Other Part which expression wherever the context\nso requires shall mean and include his heirs, executors, administrators, legal representatives,\nsuccessors etc.\n\nWHEREAS the VENDOR herein has purchased the said property more fully described in\n\nthe Schedule hereunder from Thiru. D  inand by sale deed dated E and registeredon F\nas Document No. G of (year) of Book 1 volume No. wliled aly pages to —_— on the\nfile of the Sub Registrar of — || <—————___\n\nWHEREAS the VENDOR herein has been in exclusive possession and enjoyment of the\nproperty more fully described in the Schedule hereunder with a constructed house thereon\n,which w

In [20]:
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from langchain_core.prompts import PromptTemplate

In [21]:
# Prompt text for the OCR clean-up step. `{ocr_text}` is the only placeholder.
# The literal's indentation and blank lines are part of the string as written.
template = """
    You are an expert legal proofreader. Your task is to correct ONLY the transcription and spelling errors in the raw OCR text provided below. 
    
    IMPORTANT GUIDELINES:
    1. Maintain the meaning of the legal text. Do not summarize or add new information.
    2. Output ONLY the fully corrected text, nothing else.

    Original OCR Text:
    ---
    {ocr_text}
    ---

    Corrected Text:
    """

In [22]:
# Wrap the raw template; `ocr_text` is its single input variable.
prompt = PromptTemplate(input_variables=['ocr_text'], template=template)

# Hugging Face endpoint for the Llama 3.1 8B Instruct repository.
llm = HuggingFaceEndpoint(
    task="chat-completions",
    repo_id="meta-llama/Llama-3.1-8B-Instruct",
)

# Chat-model adapter around the endpoint; this is what the chain invokes.
model = ChatHuggingFace(llm=llm)

In [23]:
# Compose prompt and chat model with `|` into the chain invoked in the next cell.
final_model_chain = prompt | model

In [24]:
# Send the raw OCR text through the correction chain. The result is the model's
# response object; its `.content` attribute is read in a later cell.
final_ocr_text = final_model_chain.invoke({'ocr_text': extracted_texts})

In [30]:
# Unwrap the response's `.content`. The name is rebound to its own attribute,
# so getattr with a fallback keeps this cell safe to re-run: if the value has
# no `.content` (it was already unwrapped), it is left as it is.
final_ocr_text = getattr(final_ocr_text, "content", final_ocr_text)

In [31]:
# Sanity check: the corrected text should now be a plain `str`.
type(final_ocr_text)

str

In [26]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [39]:
# Character-based splitter: chunks of at most 1500 characters with 180
# characters of overlap between neighbours. Sizes are measured with `len`,
# i.e. in characters; the budget assumes roughly 4 characters per token.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=180,
    chunk_size=1500,
)

In [40]:
# Split the corrected text; `create_documents` takes a list of strings,
# hence the single-element list.
chunks = text_splitter.create_documents([final_ocr_text])

In [41]:
# Sanity check on the container type returned by the splitter.
type(chunks)

list

In [42]:
# Inspect the first chunk (its metadata and page_content).
chunks[0]

Document(metadata={}, page_content='SALE DEED\n\nThis DEED OF ABSOLUTE SALE executed at on this the day of , 201X\n\nby s/o residing at\nB\n\nhereinafter called the VENDOR of the one part which expression shall include his executors,\nadministrators, legal representatives, successors etc.\n\nos\n\nw/o residing at\n\nTO AND IN FAVOUR OF\n\nherein after called the PURCHASER of the other part which expression wherever the context\nso requires shall mean and include his heirs, executors, administrators, legal representatives,\nsuccessors etc.\n\nWHEREAS the VENDOR herein has purchased the said property more fully described in\n\nthe Schedule hereunder from Thiru. D  inand by sale deed dated E and registered on F\nas Document No. G of (year) of Book 1 Volume No. W and filed at W pages to —_— on the\nfile of the Sub Registrar of —— || <—————-\n\nWHEREAS the VENDOR herein has been in exclusive possession and enjoyment of the\nproperty more fully described in the Schedule hereunder with a cons

In [43]:
# Report the character length of every chunk to check them against the size limit.
for i, doc in enumerate(chunks):
    content = doc.page_content
    print(f"Chunk {i}: {len(content)} characters")

Chunk 0: 1214 characters
Chunk 1: 936 characters
