In [6]:
import os
from dotenv import load_dotenv

# Load variables from the local .env file into the process environment, then
# publish the OpenAI key under the upper-case OPENAI_API_KEY name. The
# ChatOpenAI / OpenAIEmbeddings clients below are constructed without an explicit
# key, so they rely on the environment.
load_dotenv()
_openai_api_key = os.getenv("openai_api_key") or os.getenv("OPENAI_API_KEY")
if not _openai_api_key:
    # os.environ only accepts strings; fail with an actionable message instead
    # of the opaque TypeError raised when assigning None.
    raise RuntimeError(
        "OpenAI API key not found: define openai_api_key (or OPENAI_API_KEY) "
        "in the .env file or in the environment."
    )
os.environ['OPENAI_API_KEY'] = _openai_api_key

In [7]:
import fitz
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import Optional
import json
import ast
from datetime import date

# Pydantic models describing the fields to extract from an IRS 1098-E form
class RecipientInfo(BaseModel):
    # Contact details of the recipient/lender printed on the form.
    # Optional fields carry an explicit default so they are truly optional:
    # without one, Pydantic v2 still treats an Optional[...] field as required.
    name: str = Field(description="RECIPIENT'S/LENDER'S name")
    street_address: str = Field(description="Street address")
    city: str = Field(description="City or town")
    state: str = Field(description="State or province")
    country: Optional[str] = Field(description="Country", default="United States")
    zip_code: Optional[str] = Field(description="ZIP or foreign postal code", default=None)
    phone_number: str = Field(description="Telephone number")

class BorrowerInfo(BaseModel):
    # Name and mailing address of the borrower printed on the form.
    # Only name and street address are mandatory; the remaining address parts
    # default to None so a missing value does not count as a missing field
    # (Pydantic v2 treats Optional[...] without a default as required).
    name: str = Field(description="BORROWER'S name")
    street_address: str = Field(description="Street address (including apt. no.)")
    city: Optional[str] = Field(description="City or town", default=None)
    state: Optional[str] = Field(description="State or province", default=None)
    country: Optional[str] = Field(description="Country", default=None)
    zip_code: Optional[str] = Field(description="ZIP or foreign postal code", default=None)

class Form1098E(BaseModel):
    # Top-level record for one 1098-E form: both parties, their TINs, the
    # account number and the interest amount.
    # borrower_tin and student_loan_interest default to None so that they are
    # genuinely optional (Pydantic v2 requires Optional[...] fields that have
    # no default).
    recipient: RecipientInfo = Field(description="Recipient/Lender information")
    recipient_tin: str = Field(description="RECIPIENT'S TIN")
    borrower_tin: Optional[str] = Field(description="BORROWER'S TIN", default=None)
    borrower: BorrowerInfo = Field(description="Borrower information")
    account_number: str = Field(description="Account number (see instructions)")
    student_loan_interest: Optional[float] = Field(description="Student loan interest received by lender", default=None)

def extract_text_with_ocr(pdf_path: str, dpi: int = 300, tessdata: Optional[str] = None) -> str:
    """Extract text from a scanned document using PyMuPDF's OCR.

    Every page is rendered and run through full-page OCR; the text of all
    pages is concatenated in page order.

    Args:
        pdf_path: Path of the document to open with PyMuPDF.
        dpi: Resolution passed to the OCR text page.
        tessdata: Folder holding the Tesseract language data. When omitted,
            the ``TESSDATA_PREFIX`` environment variable is used; failing that,
            the Homebrew location this notebook was originally written against
            is used if it exists. Otherwise ``None`` is passed through and
            PyMuPDF is left to locate the data itself.

    Returns:
        The concatenated OCR text of all pages.
    """
    # Machine-specific location kept only as a last-resort fallback.
    default_tessdata = '/opt/homebrew/Cellar/tesseract/5.4.1_1/share/tessdata'
    tessdata_dir = tessdata or os.environ.get("TESSDATA_PREFIX")
    if not tessdata_dir and os.path.isdir(default_tessdata):
        tessdata_dir = default_tessdata

    page_texts = []
    # The context manager closes the document even if OCR fails on some page.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            tp = page.get_textpage_ocr(
                dpi=dpi,
                full=True,
                tessdata=tessdata_dir
            )
            page_texts.append(page.get_text(textpage=tp))

    return "".join(page_texts)

def compile_template_and_get_llm_response(preamble, extracted_text, pydantic_object):
    """Build the chat prompt for an extraction request and return the model's reply.

    The system message is the given preamble. The human message is made of the
    format instructions produced by a PydanticOutputParser for
    ``pydantic_object``, the extracted text, and a fixed closing instruction.
    The reply is returned as-is (``response.content``); it is not parsed here.
    """
    output_parser = PydanticOutputParser(pydantic_object=pydantic_object)

    # Two-message prompt: system = preamble, human = instructions + text + postamble
    prompt_template = ChatPromptTemplate.from_messages([
        SystemMessagePromptTemplate.from_template("{preamble}"),
        HumanMessagePromptTemplate.from_template(
            "{format_instructions}\n\n{extracted_text}\n\n{postamble}"
        ),
    ])

    prompt_values = {
        "preamble": preamble,
        "format_instructions": output_parser.get_format_instructions(),
        "extracted_text": extracted_text,
        "postamble": "Do not include any explanation in the reply. Only include the extracted information in the reply.",
    }
    messages = prompt_template.format_prompt(**prompt_values).to_messages()

    llm_reply = ChatOpenAI().invoke(messages, temperature=0.0)
    return llm_reply.content

def extract_1098e_from_pdf(pdf_path: str) -> str:
    """OCR a 1098-E PDF and ask the LLM to extract its fields.

    Returns the string produced by compile_template_and_get_llm_response for
    the Form1098E model.
    """
    # Instructions given to the model as the system message
    instruction_sentences = [
        "You're seeing the information from an IRS 1098-E form.",
        "Your job is to accurately extract the recipient's information, borrower's information, "
        "TINs, account number, and student loan interest amount if available.",
        "Ensure all fields are filled with the available information.",
    ]
    preamble = " ".join(instruction_sentences)

    # OCR first, then hand the recognised text to the LangChain prompt
    ocr_text = extract_text_with_ocr(pdf_path)
    return compile_template_and_get_llm_response(preamble, ocr_text, Form1098E)



In [8]:
# Run the OCR + LLM extraction on the sample form and show the raw reply
# (the string returned by the model, not yet parsed into a dict).
extracted_data = extract_1098e_from_pdf("../raw-docs/IRS-1098e-form.pdf")
print(extracted_data)


{
    "recipient": {
        "name": "Simon Williams",
        "street_address": "134, MarketView Avenue",
        "city": "Florida",
        "state": "FL",
        "country": "United States",
        "zip_code": "® @)24",
        "phone_number": "(545) 533-1234"
    },
    "recipient_tin": "273-43-4323",
    "borrower_tin": "403-433-433",
    "borrower": {
        "name": "Jack B",
        "street_address": "893, Avenue express street",
        "city": "Florida",
        "state": "FL",
        "country": "United States",
        "zip_code": "penalty or other"
    },
    "account_number": "89493322",
    "student_loan_interest": 3200
}


In [9]:
# Process and save the form data
def parse_llm_json_response(raw_text):
    """Turn the model's reply into a Python object.

    The reply is expected to be JSON, so it is parsed as JSON first; parsing
    JSON with ``ast.literal_eval`` would mis-handle JSON-only escapes such as
    ``\\/``. A Python-literal parse is kept as a fallback for replies written
    with single quotes or ``None``. A Markdown code fence around the payload
    (```` ``` ```` or ```` ```json ````) is removed before parsing.

    Returns the parsed object, or an empty dict (after printing an error) when
    neither parser accepts the text.
    """
    cleaned = raw_text.strip()
    if cleaned.startswith("```"):
        # Drop the opening fence line (with its optional language tag) ...
        newline_at = cleaned.find("\n")
        cleaned = cleaned[newline_at + 1:] if newline_at != -1 else ""
        # ... and the closing fence, if present.
        cleaned = cleaned.rstrip()
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
        cleaned = cleaned.strip()

    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        pass

    try:
        return ast.literal_eval(cleaned)
    except (SyntaxError, ValueError):
        print("Error: Unable to parse the response string.")
        return {}


try:
    # Extract and process form
    form_text = extract_1098e_from_pdf("../raw-docs/IRS-1098e-form.pdf")

    # Parse the response (best effort: falls back to {} when unparseable)
    parsed_data = parse_llm_json_response(form_text)

    # Add metadata
    combined_data = {
        "document_type": "IRS-1098E",
        "extraction_date": str(date.today()),
        "extraction_method": "PyMuPDF OCR + LangChain",
        "extracted_data": parsed_data
    }

    # Save to JSON, creating the target folder on first run
    output_file = "../saved-docs/pymupdf_extracted_1098e_data.json"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w', encoding="utf-8") as f:
        json.dump(combined_data, f, indent=2)

    print(f"\nData written to {output_file}")

except Exception as e:
    # Deliberately best-effort: report the failure and let the notebook continue.
    print(f"Error: {str(e)}")


Data written to ../saved-docs/pymupdf_extracted_1098e_data.json


In [10]:
# Create retriever and chain
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.schema import Document
from langchain import hub
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Read back the JSON record written by the extraction step
with open("../saved-docs/pymupdf_extracted_1098e_data.json") as f:
    data = json.loads(f.read())

# Wrap the record in a single Document: the extracted fields become the text
# to index, the bookkeeping keys become its metadata
documents = [
    Document(
        page_content=json.dumps(data['extracted_data']),
        metadata={
            key: data[key]
            for key in ("document_type", "extraction_date", "extraction_method")
        },
    )
]

# Embed the document into a FAISS vector store and expose it as a retriever
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents=documents, embedding=embeddings)
retriever = vectorstore.as_retriever()

# Assemble the QA chain: retrieved documents are stuffed into the hub prompt
# and answered by the chat model
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")
retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
combine_docs_chain = create_stuff_documents_chain(llm=llm, prompt=retrieval_qa_chat_prompt)
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=combine_docs_chain,
)

# Helper function for asking questions
def ask_1098e_question(question: str) -> str:
    """Run one question through the retrieval chain and return its 'answer' entry."""
    chain_input = {"input": question}
    return retrieval_chain.invoke(chain_input)['answer']

# Sample questions, one per piece of information captured by the Form1098E model
questions = [
    "What is the recipient's name and phone number?",
    "What is the borrower's complete address?",
    "What is the recipient's TIN?",
    "What is the account number on the form?",
    "How much student loan interest was received?",
    "What is the borrower's TIN?",
]

# Ask each question in turn; the question is printed before the chain is
# invoked so progress is visible while the answer is being produced
print("\nExample Questions and Answers:", "-" * 50, sep="\n")
for question in questions:
    print("\nQ:", question)
    print("A:", ask_1098e_question(question))




Example Questions and Answers:
--------------------------------------------------

Q: What is the recipient's name and phone number?
A: The recipient's name is Simon Williams and their phone number is (545) 533-1234.

Q: What is the borrower's complete address?
A: The borrower's complete address is 893, Avenue express street, Florida, FL, United States, penalty or other.

Q: What is the recipient's TIN?
A: The recipient's TIN is 273-43-4323.

Q: What is the account number on the form?
A: The account number on the form is 89493322.

Q: How much student loan interest was received?
A: $3,200 of student loan interest was received.

Q: What is the borrower's TIN?
A: The borrower's TIN is "403-433-433".
