In [1]:
import os

from dotenv import load_dotenv
from unstract.llmwhisperer.client import LLMWhispererClient
from unstract.llmwhisperer.client import LLMWhispererClientException

# Read the .env file into the process environment. Without this call the
# os.getenv() lookups below only see variables that were already exported
# in the shell that started the kernel.
load_dotenv()

# Map the OpenAI key onto the variable name the OpenAI/LangChain clients read.
# os.environ only accepts strings, so fail early with a readable message
# instead of the "str expected, not NoneType" TypeError a missing key causes.
_openai_api_key = os.getenv("openai_api_key")
if _openai_api_key is None:
    raise RuntimeError(
        "openai_api_key is not set; add it to the .env file or the environment"
    )
os.environ['OPENAI_API_KEY'] = _openai_api_key

# LLMWhisperer credentials and endpoint
UNSTRACT_API_KEY = os.getenv("unstract_api_key")
BASE_URL = os.getenv("BASE_URL")

client = LLMWhispererClient(base_url=BASE_URL, api_key=UNSTRACT_API_KEY)

2024-10-21 09:57:42,094 - unstract.llmwhisperer.client - DEBUG - logging_level set to DEBUG
2024-10-21 09:57:42,094 - unstract.llmwhisperer.client - DEBUG - base_url set to https://llmwhisperer-api.unstract.com/v1
2024-10-21 09:57:42,095 - unstract.llmwhisperer.client - DEBUG - api_key set to fc21xxxxxxxxxxxxxxxxxxxxxxxxxxxx


In [4]:
# Processing the PDF file using OCR mode
try:
    result = client.whisper(
        file_path="../raw-docs/IRS-1098e-form.pdf",
        processing_mode="ocr", # "text", "ocr"
        force_text_processing=True
    )
except LLMWhispererClientException as e:
    # Later cells read `extracted_text`; continuing after a failed call would
    # leave it undefined (or stale from an earlier run), so show the client
    # error and stop here.
    print(e)
    raise
else:
    extracted_text = result["extracted_text"]
    print(extracted_text)

2024-10-21 09:58:37,200 - unstract.llmwhisperer.client - DEBUG - whisper called
2024-10-21 09:58:37,201 - unstract.llmwhisperer.client - DEBUG - api_url: https://llmwhisperer-api.unstract.com/v1/whisper
2024-10-21 09:58:37,202 - unstract.llmwhisperer.client - DEBUG - params: {'url': '', 'processing_mode': 'ocr', 'output_mode': 'line-printer', 'page_seperator': '<<<', 'force_text_processing': True, 'pages_to_extract': '', 'timeout': 200, 'store_metadata_for_highlighting': False, 'median_filter_size': 0, 'gaussian_blur_radius': 0, 'ocr_provider': 'advanced', 'line_splitter_tolerance': 0.4, 'horizontal_stretch_factor': 1.0}





                                                   [X] CORRECTED (if checked) 
 RECIPIENT'S/LENDER'S name, street address, city or town, state or                          OMB No. 1545-1576 
 province, country, ZIP or foreign postal code, and telephone number 
                                                                                                                                Student 
Simon Williams 
 134, MarketView Avenue                                                                       2024                     Loan Interest 
Florida 
                                                                                                                            Statement 
FL 
(545) 533-1234                                                                                Form 1098-E 
 RECIPIENT'S TIN                 BORROWER'S TIN                   1 Student loan interest received by lender                      Copy B 
          273-43-4323                      403-433-433   

In [5]:
from pydantic import BaseModel, Field
from typing import Optional

class RecipientInfo(BaseModel):
    """Recipient/lender contact details read from a Form 1098-E."""

    name: str = Field(description="RECIPIENT'S/LENDER'S name")
    street_address: str = Field(description="Street address")
    city: str = Field(description="City or town")
    state: str = Field(description="State or province")
    country: Optional[str] = Field(description="Country", default="United States")
    # Optional alone only allows None as a value; the explicit default also
    # lets the key be left out (pydantic v2 otherwise treats it as required).
    zip_code: Optional[str] = Field(description="ZIP or foreign postal code", default=None)
    phone_number: str = Field(description="Telephone number")

class BorrowerInfo(BaseModel):
    """Borrower name and address read from a Form 1098-E."""

    name: str = Field(description="BORROWER'S name")
    street_address: str = Field(description="Street address (including apt. no.)")
    # Optional alone only allows None as a value; the explicit defaults also
    # let these keys be left out (pydantic v2 otherwise treats them as required).
    city: Optional[str] = Field(description="City or town", default=None)
    state: Optional[str] = Field(description="State or province", default=None)
    country: Optional[str] = Field(description="Country", default=None)
    zip_code: Optional[str] = Field(description="ZIP or foreign postal code", default=None)

class Form1098E(BaseModel):
    """Top-level schema for the fields extracted from a Form 1098-E."""

    recipient: RecipientInfo = Field(description="Recipient/Lender information")
    recipient_tin: str = Field(description="RECIPIENT'S TIN")
    # Optional alone only allows None as a value; the explicit default also
    # lets the key be left out (pydantic v2 otherwise treats it as required).
    borrower_tin: Optional[str] = Field(description="BORROWER'S TIN", default=None)
    borrower: BorrowerInfo = Field(description="Borrower information")
    account_number: str = Field(description="Account number (see instructions)")
    # Same reasoning as borrower_tin: absent on the form means absent in the reply.
    student_loan_interest: Optional[float] = Field(
        description="Student loan interest received by lender", default=None
    )



In [6]:
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser

def compile_template_and_get_llm_response(preamble, extracted_text, pydantic_object):
    """Send an extraction prompt to the chat model and return its reply text.

    The system message carries ``preamble``. The human message carries the
    format instructions generated for ``pydantic_object``, then
    ``extracted_text``, then a fixed closing instruction.

    Args:
        preamble: Text used as the system message.
        extracted_text: Document text the model should extract from.
        pydantic_object: Pydantic model class passed to
            ``PydanticOutputParser`` to produce the format instructions.

    Returns:
        The ``content`` of the model's reply. It is printed and returned
        as-is; the parser is not used to parse or validate it.
    """
    output_parser = PydanticOutputParser(pydantic_object=pydantic_object)

    prompt = ChatPromptTemplate.from_messages([
        SystemMessagePromptTemplate.from_template("{preamble}"),
        HumanMessagePromptTemplate.from_template(
            "{format_instructions}\n\n{extracted_text}\n\n{postamble}"
        ),
    ])
    messages = prompt.format_prompt(
        preamble=preamble,
        format_instructions=output_parser.get_format_instructions(),
        extracted_text=extracted_text,
        postamble="Do not include any explanation in the reply. Only include the extracted information in the reply.",
    ).to_messages()

    llm_reply = ChatOpenAI().invoke(messages, temperature=0.0)
    content = llm_reply.content
    print(f"Response from LLM:\n{content}")
    return content


def extract_irs_1098e_from_text(extracted_text):
    """Run the Form 1098-E extraction prompt over ``extracted_text``.

    Returns whatever ``compile_template_and_get_llm_response`` returns when
    called with the 1098-E instructions and the ``Form1098E`` schema.
    """
    instructions = " ".join([
        "You're seeing the information from an IRS 1098-E form.",
        "Your job is to accurately extract the recipient's information, borrower's information,",
        "TINs, account number, and student loan interest amount if available.",
        "Ensure all fields are filled with the available information.",
    ])
    return compile_template_and_get_llm_response(instructions, extracted_text, Form1098E)

In [7]:
# Run the 1098-E extraction over the text produced by the whisper call;
# the helper prints the model's reply and returns it as a string.
extracted_irs_text = extract_irs_1098e_from_text(extracted_text)

Response from LLM:
{
    "recipient": {
        "name": "Simon Williams",
        "street_address": "134, MarketView Avenue",
        "city": "Florida",
        "state": "FL",
        "country": null,
        "zip_code": "2024",
        "phone_number": "(545) 533-1234"
    },
    "recipient_tin": "273-43-4323",
    "borrower_tin": "403-433-433",
    "borrower": {
        "name": "Jack Benny",
        "street_address": "893, Avenue express street",
        "city": "Florida",
        "state": "FL",
        "country": null,
        "zip_code": null
    },
    "account_number": "89493322",
    "student_loan_interest": 3200
}


In [12]:
import json
import ast
from datetime import date

def _parse_llm_response(text):
    """Turn the LLM reply string into a Python object.

    The reply is requested as JSON, so ``json.loads`` is tried first.
    ``ast.literal_eval`` is kept as a fallback for replies written as a
    Python literal (for example single-quoted keys).

    Raises:
        ValueError: if the text is neither valid JSON nor a Python literal.
    """
    cleaned = text.strip()
    # Drop a surrounding Markdown code fence (``` or ```json) if present.
    if cleaned.startswith("```"):
        _, _, cleaned = cleaned.partition("\n")
        cleaned = cleaned.rsplit("```", 1)[0].strip()
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        pass
    try:
        return ast.literal_eval(cleaned)
    except (SyntaxError, ValueError) as exc:
        raise ValueError("response is neither JSON nor a Python literal") from exc


# saving the output as response
response = extracted_irs_text

# Parse the string into a Python dictionary; on failure report it and fall
# back to an empty payload so the output file is still written.
try:
    parsed_data = _parse_llm_response(response)
except ValueError:
    print("Error: Unable to parse the response string.")
    parsed_data = {}

# Wrap the extraction with some metadata
combined_data = {
    "document_type": "IRS-1098e-form",
    "extraction_date": str(date.today()),  # date the extraction was run
    "extracted_data": parsed_data
}

# Write to a JSON file, creating the target directory on first run
output_file = os.path.join('../saved-docs', 'irs_1098e_extracted_data.json')
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(combined_data, f, indent=2)

print(f"Data written to {output_file}")

Data written to ../saved-docs/irs_1098e_extracted_data.json


In [14]:
# Chain creation and LLM generation

from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain import hub
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Read the saved extraction result back from disk
with open("../saved-docs/irs_1098e_extracted_data.json", 'r') as f:
    data = json.load(f)

# One document: the extracted fields serialised as JSON text, with the
# top-level bookkeeping keys carried along as metadata
documents = [
    Document(
        page_content=json.dumps(data['extracted_data']),
        metadata={key: data[key] for key in ("document_type", "extraction_date")},
    )
]

# Build the FAISS vector store from the document using OpenAI embeddings
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents, embeddings)

# Retriever view over the vector store
retriever = vectorstore.as_retriever()

# Chat model that answers the question
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")

# QA prompt pulled from the LangChain hub
retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

# Combine the retriever with the document-combining chain
combine_docs_chain = create_stuff_documents_chain(llm, retrieval_qa_chat_prompt)
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

# Ask a question; the last expression displays the answer text
response = retrieval_chain.invoke({"input": "What is the Borrower's city?"})
response['answer']


"The Borrower's city is Florida."