In [4]:
import os
from dotenv import load_dotenv

# Load the variables defined in the .env file, then expose the OpenAI key under
# the upper-case name OPENAI_API_KEY for the current process.
# load_dotenv() does not override variables that are already set, so calling it
# again in a kernel where it already ran is harmless.
load_dotenv()

# Accept either spelling: the lower-case name originally read here, or a key that is
# already exported as OPENAI_API_KEY.
_openai_api_key = os.getenv("openai_api_key") or os.getenv("OPENAI_API_KEY")
if not _openai_api_key:
    # Assigning None to os.environ raises an opaque TypeError; fail with an
    # actionable message instead.
    raise RuntimeError(
        "OpenAI API key not found: define 'openai_api_key' in the .env file "
        "or export OPENAI_API_KEY before running this notebook."
    )
os.environ['OPENAI_API_KEY'] = _openai_api_key

In [5]:
import fitz
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import Optional
import json
import ast
from datetime import date

# Your Pydantic models
class RecipientInfo(BaseModel):
    # Recipient/lender contact details of a Form 1098-E.
    # Documented with comments rather than a class docstring on purpose: pydantic
    # copies a model docstring into the generated JSON schema, which would change
    # the format instructions built from this model.
    name: str = Field(description="RECIPIENT'S/LENDER'S name")
    street_address: str = Field(description="Street address")
    city: str = Field(description="City or town")
    state: str = Field(description="State or province")
    country: Optional[str] = Field(description="Country", default="United States")
    # Explicit default=None: under pydantic v2 an Optional annotation alone does not
    # make a field optional, so a form without a ZIP code would fail validation.
    zip_code: Optional[str] = Field(description="ZIP or foreign postal code", default=None)
    phone_number: str = Field(description="Telephone number")

class BorrowerInfo(BaseModel):
    # Borrower name and mailing address of a Form 1098-E.
    # Documented with comments rather than a class docstring on purpose: pydantic
    # copies a model docstring into the generated JSON schema, which would change
    # the format instructions built from this model.
    name: str = Field(description="BORROWER'S name")
    street_address: str = Field(description="Street address (including apt. no.)")
    # The Optional fields below carry an explicit default=None: under pydantic v2 an
    # Optional annotation alone still leaves the field required, so a partially
    # filled address would fail validation.
    city: Optional[str] = Field(description="City or town", default=None)
    state: Optional[str] = Field(description="State or province", default=None)
    country: Optional[str] = Field(description="Country", default=None)
    zip_code: Optional[str] = Field(description="ZIP or foreign postal code", default=None)

class Form1098E(BaseModel):
    # Top-level schema for one Form 1098-E: lender block, borrower block, both TINs,
    # the account number and the interest amount.
    # Documented with comments rather than a class docstring on purpose: pydantic
    # copies a model docstring into the generated JSON schema, which would change
    # the format instructions built from this model.
    recipient: RecipientInfo = Field(description="Recipient/Lender information")
    recipient_tin: str = Field(description="RECIPIENT'S TIN")
    # Explicit default=None on the Optional fields: under pydantic v2 an Optional
    # annotation alone still leaves the field required.
    borrower_tin: Optional[str] = Field(description="BORROWER'S TIN", default=None)
    borrower: BorrowerInfo = Field(description="Borrower information")
    account_number: str = Field(description="Account number (see instructions)")
    student_loan_interest: Optional[float] = Field(
        description="Student loan interest received by lender", default=None
    )



In [7]:
import PyPDF2
import pytesseract
from PIL import Image
import pdf2image
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
import json
import ast
from datetime import date

def extract_text_from_scanned_pdf(pdf_path: str) -> str:
    """OCR every page of a PDF and return the combined text.

    The PDF is rasterised with ``pdf2image.convert_from_path`` and each page image
    is passed to ``pytesseract.image_to_string``. Page texts are concatenated in
    page order without any extra separator, and the result is stripped.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The stripped OCR text, or an empty string if any step raised (the error
        is printed, not re-raised).
    """
    try:
        page_texts = []
        for page_number, page_image in enumerate(pdf2image.convert_from_path(pdf_path), start=1):
            print(f"Processing page {page_number}...")
            page_texts.append(pytesseract.image_to_string(page_image))
        return "".join(page_texts).strip()
    except Exception as exc:
        # Best-effort contract: report the problem and hand back an empty result.
        print(f"Error processing PDF: {str(exc)}")
        return ""

def compile_template_and_get_llm_response(preamble, extracted_text, pydantic_object):
    """Build a two-message chat prompt and return the model's raw reply text.

    The system message is ``preamble``. The human message is, in order: the format
    instructions produced by a ``PydanticOutputParser`` for ``pydantic_object``,
    the ``extracted_text``, and a fixed closing instruction asking for no
    explanation. The parser is only used to generate format instructions; the
    reply is returned unparsed.

    Args:
        preamble: Text of the system message.
        extracted_text: Document text to extract information from.
        pydantic_object: Pydantic model class describing the desired output.

    Returns:
        The ``content`` attribute of the response from ``ChatOpenAI().invoke``.
    """
    output_parser = PydanticOutputParser(pydantic_object=pydantic_object)

    prompt_template = ChatPromptTemplate.from_messages([
        SystemMessagePromptTemplate.from_template("{preamble}"),
        HumanMessagePromptTemplate.from_template(
            "{format_instructions}\n\n{extracted_text}\n\n{postamble}"
        ),
    ])

    template_values = {
        "preamble": preamble,
        "format_instructions": output_parser.get_format_instructions(),
        "extracted_text": extracted_text,
        "postamble": "Do not include any explanation in the reply. Only include the extracted information in the reply.",
    }
    messages = prompt_template.format_prompt(**template_values).to_messages()

    # The temperature is supplied per call rather than on the constructor.
    llm_reply = ChatOpenAI().invoke(messages, temperature=0.0)
    return llm_reply.content

def extract_1098e_from_pdf(pdf_path: str) -> str:
    """OCR a 1098-E PDF and ask the LLM to structure the extracted text.

    Args:
        pdf_path: Path of the scanned Form 1098-E PDF.

    Returns:
        The raw reply text from ``compile_template_and_get_llm_response``, requested
        in the format described by the ``Form1098E`` model.

    Raises:
        ValueError: If OCR produced no text. ``extract_text_from_scanned_pdf``
            returns an empty string on failure, and sending an empty document to
            the model would only yield invented field values.
    """
    # First extract text using OCR
    extracted_text = extract_text_from_scanned_pdf(pdf_path)
    if not extracted_text:
        raise ValueError(
            f"No text could be extracted from {pdf_path!r}; not calling the LLM."
        )

    # Define the preamble for 1098-E processing
    preamble = ("You're seeing the information from an IRS 1098-E form. "
                "Your job is to accurately extract the recipient's information, borrower's information, "
                "TINs, account number, and student loan interest amount if available. "
                "Ensure all fields are filled with the available information.")

    # Process with LangChain
    return compile_template_and_get_llm_response(preamble, extracted_text, Form1098E)

# Process the form
pdf_path = "../raw-docs/IRS-1098e-form.pdf"


def parse_llm_response(response_text):
    """Turn the model's reply text into a Python object.

    A surrounding Markdown code fence (``` or ```json) is removed first. The text
    is then parsed as JSON, because the prompt asks for JSON output; if that
    fails, it is parsed as a Python literal to accept single-quoted dict replies.

    Args:
        response_text: Raw reply text from the model.

    Returns:
        The parsed value, or an empty dict (after printing an error) when neither
        parser accepts the text.
    """
    cleaned = response_text.strip()
    if cleaned.startswith("```"):
        # Drop the opening fence line, then a trailing closing fence if present.
        cleaned = cleaned.split("\n", 1)[1] if "\n" in cleaned else ""
        cleaned = cleaned.rstrip()
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
        cleaned = cleaned.strip()

    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        pass

    try:
        return ast.literal_eval(cleaned)
    except (SyntaxError, ValueError):
        print("Error: Unable to parse the response string.")
        return {}


# Bind the name before the try block so code that later checks `parsed_data`
# still works when extraction fails part-way through.
parsed_data = {}

try:
    # Extract and process form
    form_text = extract_1098e_from_pdf(pdf_path)

    # Parse the response
    parsed_data = parse_llm_response(form_text)

    # Add metadata
    combined_data = {
        "document_type": "IRS-1098E",
        "extraction_date": str(date.today()),
        "extraction_method": "PyPDF2 OCR + LangChain",
        "extracted_data": parsed_data
    }

    # Save to JSON
    output_file = "../saved-docs/pypdf_extracted_1098e_data.json"
    with open(output_file, 'w', encoding="utf-8") as f:
        json.dump(combined_data, f, indent=2)

    print(f"\nData written to {output_file}")

except Exception as e:
    # Report the failure and continue; parsed_data keeps its last assigned value.
    print(f"Error: {str(e)}")

# Helper function to ask questions about the form
def ask_1098e_question(question: str, parsed_data: dict) -> str:
    """Answer one question about extracted 1098-E data using the chat model.

    Args:
        question: The question to put to the model.
        parsed_data: Extracted form data; it is embedded in the prompt as
            indented JSON.

    Returns:
        The ``content`` of the reply from ``ChatOpenAI(temperature=0)``.
    """
    form_json = json.dumps(parsed_data, indent=2)

    # The four-space prefixes (including on the otherwise blank lines) are part of
    # the prompt text and are kept exactly.
    prompt = (
        "Based on the following Form 1098-E data:\n"
        f"    {form_json}\n"
        "    \n"
        f"    Please answer this question: {question}\n"
        "    \n"
        "    Provide a clear, concise answer with specific numbers when relevant."
    )

    return ChatOpenAI(temperature=0).invoke(prompt).content

# Sample questions, one per kind of field extracted from the form
questions = [
    "What is the recipient's name and phone number?",
    "What is the borrower's complete address?",
    "What is the recipient's TIN?",
    "What is the account number on the form?",
    "How much student loan interest was received?",
    "What is the borrower's TIN?",
]

# Run the Q&A only when extraction produced something to ask about
if parsed_data:
    print("\nAnswering Questions about Form 1098-E:")
    for question in questions:
        # Show the question before the model call so it is visible while waiting.
        print(f"\nQ: {question}")
        # parsed_data is passed as-is: it already holds the extracted fields,
        # not the metadata wrapper that was written to disk.
        answer = ask_1098e_question(question, parsed_data)
        print(f"A: {answer}")

Processing page 1...

Data written to ../saved-docs/pypdf_extracted_1098e_data.json

Answering Questions about Form 1098-E:

Q: What is the recipient's name and phone number?
A: The recipient's name is Simon Williams Student and their phone number is (545) 533-1234.

Q: What is the borrower's complete address?
A: The borrower's complete address is 893 Avenue Express Street, Florida, FL, 89493322.

Q: What is the recipient's TIN?
A: The recipient's TIN is 273-43-4323.

Q: What is the account number on the form?
A: The account number on the form is 89493322.

Q: How much student loan interest was received?
A: The amount of student loan interest received was $3,200.

Q: What is the borrower's TIN?
A: The borrower's TIN is 403-433-433.
