In [7]:
import os
from dotenv import load_dotenv
load_dotenv()  # read variables from a local .env file into the process environment

# Fail fast with an actionable message when a credential is missing.
# Writing os.getenv()'s result straight back into os.environ raises an opaque
# "TypeError: str expected, not NoneType" whenever the variable is unset.
for _var_name in ("GROQ_API_KEY", "HF_TOKEN"):
    _var_value = os.getenv(_var_name)
    if _var_value is None:
        raise RuntimeError(
            f"{_var_name} is not set; add it to your .env file or export it "
            "before running this notebook."
        )
    os.environ[_var_name] = _var_value

In [8]:
from langchain_community.document_loaders import PyPDFLoader 
# Parse the sample invoice PDF; load() returns a list of Document objects.
loader = PyPDFLoader('clean_invoice_sample.pdf')
pdf_documents = loader.load()

# Show what was loaded: the full list, a divider, then the class of the first entry.
print(pdf_documents)
print("-----------------------------------------------------------------------------------")
print(pdf_documents[0].__class__)

[Document(metadata={'producer': 'Mac OS X 10.10.4 Quartz PDFContext', 'creator': 'Sliced Invoices', 'creationdate': "D:20160204000502Z00'00'", 'title': 'PDF Invoice Example', 'author': 'Sliced Invoices', 'subject': 'Example PDF invoice that was created using the Sli', 'moddate': "D:20160204000502Z00'00'", 'source': 'clean_invoice_sample.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='Invoice\nPayment is due within 30 days from date of invoice. Late payment is subject to fees of 5% per month.\nThanks for choosing DEMO - Sliced Invoices | admin@slicedinvoices.com\nPage 1/1\nFrom:\nDEMO - Sliced Invoices\nSuite 5A-1204\n123 Somewhere Street\nYour City AZ 12345\nadmin@slicedinvoices.com\nInvoice Number INV-3337\nOrder Number 12345\nInvoice Date January 25, 2016\nDue Date January 31, 2016\nTotal Due $93.50\nTo:\nTest Business\n123 Somewhere St\nMelbourne, VIC 3000\ntest@test.com\nHrs/Qty Service Rate/Price Adjust Sub Total\n1.00 Web Design\nThis is a sample description.

In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter 
# Chunking configuration: chunk_size=500 with a chunk_overlap of 50.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)

# split_documents() accepts the list of loaded Document objects directly.
# Alternative for raw text instead of Documents (takes a list of strings):
#   text_splitter.create_documents([pdf_documents[0].page_content])
split_document = text_splitter.split_documents(pdf_documents)

# Last expression of the cell, so the resulting chunks are displayed.
split_document

[Document(metadata={'producer': 'Mac OS X 10.10.4 Quartz PDFContext', 'creator': 'Sliced Invoices', 'creationdate': "D:20160204000502Z00'00'", 'title': 'PDF Invoice Example', 'author': 'Sliced Invoices', 'subject': 'Example PDF invoice that was created using the Sli', 'moddate': "D:20160204000502Z00'00'", 'source': 'clean_invoice_sample.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='Invoice\nPayment is due within 30 days from date of invoice. Late payment is subject to fees of 5% per month.\nThanks for choosing DEMO - Sliced Invoices | admin@slicedinvoices.com\nPage 1/1\nFrom:\nDEMO - Sliced Invoices\nSuite 5A-1204\n123 Somewhere Street\nYour City AZ 12345\nadmin@slicedinvoices.com\nInvoice Number INV-3337\nOrder Number 12345\nInvoice Date January 25, 2016\nDue Date January 31, 2016\nTotal Due $93.50\nTo:\nTest Business\n123 Somewhere St\nMelbourne, VIC 3000\ntest@test.com'),
 Document(metadata={'producer': 'Mac OS X 10.10.4 Quartz PDFContext', 'creator': 'Sliced 

In [10]:
from langchain_huggingface import HuggingFaceEmbeddings 
# Embedding wrapper for the all-MiniLM-L6-v2 model.
embeddings = HuggingFaceEmbeddings(
    model="all-MiniLM-L6-v2",
)

# Smoke test: embed one short string and print the vector length
# (384 in the recorded output of this cell).
text = "this a test string"
query_result = embeddings.embed_query(text)
print(len(query_result))

384


In [15]:
from langchain_groq import ChatGroq
# Groq-hosted chat model used for the classification step below.
# temperature=0 is presumably chosen to keep the output as repeatable as possible.
llm = ChatGroq(model="openai/gpt-oss-120b", temperature=0)


In [16]:
# Build the classifier input from the original pages rather than from
# `split_document`: those chunks were produced with chunk_overlap=50, so
# concatenating them can repeat text at chunk boundaries in the prompt.
# Pages are separated by a blank line.
document_text = "\n\n".join(page.page_content for page in pdf_documents)


In [17]:
from langchain_core.prompts import ChatPromptTemplate
# Prompt text for the classifier. `{document_text}` is the only template
# variable; it is filled in when the chain is invoked.
CLASSIFICATION_TEMPLATE = """
You are an office document classification assistant.

Classify the following document into ONE of:
Invoice, Purchase Order, Contract, HR Document, Internal Memo, Financial Report.

Return STRICT JSON with:
- document_type
- confidence (0 to 1)
- recommended_department
- reasoning (1 short sentence)

Document content:
----------------
{document_text}
"""

prompt = ChatPromptTemplate.from_template(CLASSIFICATION_TEMPLATE)


In [18]:
# Pipe the prompt into the chat model to form a runnable chain.
chain = prompt | llm

# Fill the template's single variable and run the classification.
response = chain.invoke(dict(document_text=document_text))

# Print the model's reply text (the prompt asks for a JSON object).
print(response.content)


{
  "document_type": "Invoice",
  "confidence": 0.99,
  "recommended_department": "Accounting",
  "reasoning": "The document contains invoice number, dates, line items, totals, and payment terms typical of an invoice."
}
