In [18]:
pip install langchain

Collecting langchain
  Downloading langchain-1.2.0-py3-none-any.whl.metadata (4.9 kB)
Collecting langgraph<1.1.0,>=1.0.2 (from langchain)
  Downloading langgraph-1.0.5-py3-none-any.whl.metadata (7.4 kB)
Collecting langgraph-checkpoint<4.0.0,>=2.1.0 (from langgraph<1.1.0,>=1.0.2->langchain)
  Using cached langgraph_checkpoint-3.0.1-py3-none-any.whl.metadata (4.7 kB)
Collecting langgraph-prebuilt<1.1.0,>=1.0.2 (from langgraph<1.1.0,>=1.0.2->langchain)
  Using cached langgraph_prebuilt-1.0.5-py3-none-any.whl.metadata (5.2 kB)
Collecting langgraph-sdk<0.4.0,>=0.3.0 (from langgraph<1.1.0,>=1.0.2->langchain)
  Downloading langgraph_sdk-0.3.1-py3-none-any.whl.metadata (1.6 kB)
Collecting xxhash>=3.5.0 (from langgraph<1.1.0,>=1.0.2->langchain)
  Using cached xxhash-3.6.0-cp313-cp313-win_amd64.whl.metadata (13 kB)
Collecting ormsgpack>=1.12.0 (from langgraph-checkpoint<4.0.0,>=2.1.0->langgraph<1.1.0,>=1.0.2->langchain)
  Downloading ormsgpack-1.12.1-cp313-cp313-win_amd64.whl.metadata (3.3 kB)
D


[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:
# Let's update to the new ChromaDB API
!pip install -q chromadb==0.4.18  # Ensure we have a compatible version

import chromadb
from chromadb.config import Settings

# Updated imports
import os
from pathlib import Path
from typing import List, Dict, Any
import tempfile
from datetime import datetime
from docx import Document
import docx2txt
from langchain_classic.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document as LCDocument
import json


[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Invalid requirement: '#': Expected package name at the start of dependency specifier
    #
    ^


In [None]:
class ARKnowledgeBaseEnhanced:
    """Enhanced knowledge base with document loading capabilities - UPDATED FOR NEW CHROMADB"""
    
    def __init__(self, persist_directory="./chroma_db_enhanced"):
        self.persist_directory = persist_directory
        
        # NEW ChromaDB client initialization
        try:
            # Try the new persistent client
            self.client = chromadb.PersistentClient(path=persist_directory)
            print(f"Created persistent Chroma client at {persist_directory}")
        except:
            # Fallback to in-memory
            self.client = chromadb.EphemeralClient()
            print("Created in-memory Chroma client")
        
        # Create/get collection
        self.collection_name = "ar_knowledge_with_docs"
        try:
            self.collection = self.client.get_collection(self.collection_name)
            print(f"Loaded existing collection: {self.collection_name}")
            self.doc_count = self.collection.count()
        except:
            self.collection = self.client.create_collection(self.collection_name)
            self.doc_count = 0
            print(f"Created new collection: {self.collection_name}")
    
    def load_document(self, file_path: str, metadata: Dict[str, Any] = None):
        """Load a document (DOCX, PDF, TXT) into the knowledge base"""
        file_ext = Path(file_path).suffix.lower()
        
        # Default metadata
        if metadata is None:
            metadata = {
                "source": file_path,
                "type": "document",
                "loaded_date": datetime.now().isoformat()
            }
        
        try:
            if file_ext == '.docx':
                # Method 1: Using docx2txt (better for complex docs)
                text = docx2txt.process(file_path)
                if not text.strip():
                    # Fallback to python-docx
                    doc = Document(file_path)
                    text = "\n".join([para.text for para in doc.paragraphs])
                
                # Create LangChain Document
                documents = [LCDocument(page_content=text, metadata=metadata)]
                
            elif file_ext == '.pdf':
                # For PDFs, we need PyPDF2 or similar
                try:
                    import PyPDF2
                    text = ""
                    with open(file_path, 'rb') as file:
                        pdf_reader = PyPDF2.PdfReader(file)
                        for page in pdf_reader.pages:
                            text += page.extract_text() + "\n"
                    documents = [LCDocument(page_content=text, metadata=metadata)]
                except ImportError:
                    print(" PyPDF2 not installed. Install with: pip install PyPDF2")
                    # Create a dummy document for demo
                    documents = [LCDocument(
                        page_content="PDF content would be extracted here. Install PyPDF2 for full functionality.",
                        metadata=metadata
                    )]
                    
            elif file_ext == '.txt':
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()
                documents = [LCDocument(page_content=text, metadata=metadata)]
                
            else:
                print(f"Unsupported file type: {file_ext}")
                return False
            
            # Split documents into chunks
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200,
                length_function=len,
                separators=["\n\n", "\n", ". ", " ", ""]
            )
            
            chunks = text_splitter.split_documents(documents)
            
            # Prepare for ChromaDB
            docs_text = [chunk.page_content for chunk in chunks]
            docs_metadata = [chunk.metadata for chunk in chunks]
            docs_ids = [f"doc_{self.doc_count + i}" for i in range(len(chunks))]
            
            # Add to collection
            self.collection.add(
                documents=docs_text,
                metadatas=docs_metadata,
                ids=docs_ids
            )
            
            self.doc_count += len(chunks)
            print(f"Loaded {len(chunks)} chunks from {Path(file_path).name}")
            return True
            
        except Exception as e:
            print(f"Error loading {file_path}: {str(e)[:100]}...")
            return False
    
    def load_sample_documents(self):
        """Load sample AR documents for demonstration"""
        print("ðŸ“š Loading sample AR documents...")
        
        # Create sample documents in memory
        sample_docs = [
            {
                "name": "Cash_Application_SOP.docx",
                "content": """
                CASH APPLICATION STANDARD OPERATING PROCEDURE
                Document ID: SOP-AR-001
                Version: 3.0
                
                1.0 PURPOSE
                This document outlines the procedure for applying customer payments.
                
                2.0 PROCEDURE STEPS
                2.1 Daily Processing
                - Log into SAP system using T-code F-28
                - Download bank statement from lockbox
                - Match payments to open invoices
                - Apply payments before 2 PM daily
                
                2.2 Payment Matching Rules
                - Exact amount match: Apply to specific invoice
                - Partial payment: Apply to oldest invoice first
                - No remittance: Park in unapplied cash (GL 1100.500)
                
                2.3 Exception Handling
                - Short payments: Create deduction ticket
                - Over payments: Create credit memo
                - Payment on account: Apply per customer instruction
                
                3.0 KEY PERFORMANCE INDICATORS
                - Application accuracy: >99%
                - Daily completion rate: 100%
                - Unapplied cash: <2% of total
                """,
                "metadata": {
                    "type": "sop",
                    "category": "cash_application",
                    "version": "3.0",
                    "department": "AR"
                }
            },
            {
                "name": "Deduction_Handling_Guide.docx",
                "content": """
                DEDUCTION HANDLING GUIDE
                
                COMMON DEDUCTION CODES:
                DC01 - Pricing Dispute
                DC02 - Short Shipment
                DC03 - Quality Issues
                DC04 - Promotional Allowance
                DC05 - Freight Dispute
                
                RESOLUTION PROCESS:
                1. Validate the deduction with supporting documents
                2. Check pricing agreements in Salesforce
                3. Contact warehouse for quantity verification
                4. If valid: Create credit memo
                5. If invalid: Send dispute email
                
                ESCALATION MATRIX:
                - Under $1,000: Team Lead
                - $1,000-$5,000: AR Manager
                - Over $5,000: Controller
                
                TIMELINES:
                - Initial review: 2 business days
                - Resolution: 10 business days max
                """,
                "metadata": {
                    "type": "guide",
                    "category": "deductions",
                    "department": "AR"
                }
            },
            {
                "name": "Customer_Specific_Notes.txt",
                "content": """
                CUSTOMER-SPECIFIC NOTES
                
                WALMART (CUST-001):
                - Payments: Every Tuesday and Thursday
                - Deductions: Common for promotional allowances
                - Contacts: John Smith (jsmith@walmart.com)
                - Notes: Always takes 2% discount even when late
                
                AMAZON (CUST-002):
                - Payments: Weekly via ACH
                - Portal: vendorcentral.amazon.com
                - Response time: 48 hours required
                - Notes: Use portal for all communications
                
                TARGET (CUST-003):
                - Payments: 15th and 30th of month
                - Deductions: Require invoice copy
                - Contacts: target_ar@target.com
                - Notes: Accept discounts up to 5 days late
                """,
                "metadata": {
                    "type": "customer_notes",
                    "category": "customers",
                    "department": "AR"
                }
            }
        ]
        
        # Create temporary files and load them
        loaded_count = 0
        for doc_info in sample_docs:
            try:
                # Create temporary file
                with tempfile.NamedTemporaryFile(
                    suffix=Path(doc_info["name"]).suffix,
                    mode='w',
                    delete=False,
                    encoding='utf-8'
                ) as temp_file:
                    
                    if doc_info["name"].endswith('.docx'):
                        # Create actual Word document
                        document = Document()
                        
                        # Split content into paragraphs
                        lines = doc_info["content"].strip().split('\n')
                        for line in lines:
                            if line.strip():
                                # Check for headers
                                if any(header in line for header in 
                                      ['PROCEDURE', 'GUIDE', 'NOTES', 'CODES', 'PURPOSE']):
                                    document.add_heading(line.strip(), 1)
                                elif line.strip().startswith(('1.', '2.', '3.', '- ')):
                                    document.add_heading(line.strip(), 2)
                                else:
                                    document.add_paragraph(line.strip())
                        
                        document.save(temp_file.name)
                    else:
                        # For text files, just write content
                        temp_file.write(doc_info["content"])
                
                # Load the document
                if self.load_document(temp_file.name, doc_info["metadata"]):
                    loaded_count += 1
                
                # Clean up temp file
                os.unlink(temp_file.name)
                
            except Exception as e:
                print(f"Error with {doc_info['name']}: {e}")
        
        print(f"Loaded {loaded_count} sample documents")
        return loaded_count
    
    def search(self, query: str, n_results: int = 3, filter_metadata: Dict = None):
        """Search the knowledge base"""
        try:
            if filter_metadata:
                results = self.collection.query(
                    query_texts=[query],
                    n_results=n_results,
                    where=filter_metadata
                )
            else:
                results = self.collection.query(
                    query_texts=[query],
                    n_results=n_results
                )
            
            return results
            
        except Exception as e:
            print(f"Search error: {e}")
            return {"documents": [[]], "metadatas": [[]], "distances": [[]]}
    
    def get_document_types(self):
        """Get list of document types in the knowledge base"""
        try:
            # Get all metadata
            all_data = self.collection.get()
            metadatas = all_data.get('metadatas', [])
            
            # Extract unique document types
            doc_types = set()
            sources = set()
            
            for meta in metadatas:
                if meta:
                    doc_types.add(meta.get('type', 'unknown'))
                    sources.add(meta.get('source', 'unknown'))
            
            return {
                "document_types": list(doc_types),
                "sources": list(sources),
                "total_chunks": len(metadatas)
            }
            
        except Exception as e:
            print(f"Error getting document types: {e}")
            return {"document_types": [], "sources": [], "total_chunks": 0}
    
    def clear_collection(self):
        """Clear all documents from the collection"""
        try:
            # Get all IDs and delete them
            all_data = self.collection.get()
            if all_data['ids']:
                self.collection.delete(ids=all_data['ids'])
                print(f"Cleared {len(all_data['ids'])} documents from collection")
                self.doc_count = 0
        except Exception as e:
            print(f"Error clearing collection: {e}")

# Initialize the knowledge base
print("Initializing enhanced knowledge base...")
kb_enhanced = ARKnowledgeBaseEnhanced()

# Load sample documents
kb_enhanced.load_sample_documents()

# Show what we have
doc_info = kb_enhanced.get_document_types()
print(f"\nðŸ“Š Knowledge Base Summary:")
print(f"Total document chunks: {doc_info['total_chunks']}")
print(f"Document types: {doc_info['document_types']}")

Initializing enhanced knowledge base...
âœ… Created persistent Chroma client at ./chroma_db_enhanced
âœ… Created new collection: ar_knowledge_with_docs
ðŸ“š Loading sample AR documents...


C:\Users\battih\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 79.3M/79.3M [00:06<00:00, 12.2MiB/s]  


âœ… Loaded 1 chunks from tmp4suvdy5e.docx
âœ… Loaded 1 chunks from tmp2xl70af6.docx
âœ… Loaded 1 chunks from tmpqnxlghq1.txt
âœ… Loaded 3 sample documents

ðŸ“Š Knowledge Base Summary:
Total document chunks: 3
Document types: ['guide', 'customer_notes', 'sop']


In [24]:
# Testing document search

results = kb_enhanced.search('How do I handle deductions', n_results=3)

if results and results.get('documents') and results['documents'][0]:
    for i, (doc,metadata) in enumerate(zip(results['documents'][0], results['metadatas'][0])):
        print(f'\nResult {i+1}: ')
        print(f'Source: {metadata.get(('source','Unknown'))}')
        print(f'Type: {metadata.get('type', 'Unknown')}')
        print(f'Preview: {doc}')
else:
    print('No results found')


Result 1: 
Source: None
Type: guide
Preview: DEDUCTION HANDLING GUIDE

COMMON DEDUCTION CODES:

DC01 - Pricing Dispute

DC02 - Short Shipment

DC03 - Quality Issues

DC04 - Promotional Allowance

DC05 - Freight Dispute

RESOLUTION PROCESS:

1. Validate the deduction with supporting documents

2. Check pricing agreements in Salesforce

3. Contact warehouse for quantity verification

4. If valid: Create credit memo

5. If invalid: Send dispute email

ESCALATION MATRIX:

- Under $1,000: Team Lead

- $1,000-$5,000: AR Manager

- Over $5,000: Controller

TIMELINES:

- Initial review: 2 business days

- Resolution: 10 business days max

Result 2: 
Source: None
Type: customer_notes
Preview: CUSTOMER-SPECIFIC NOTES

                WALMART (CUST-001):
                - Payments: Every Tuesday and Thursday
                - Deductions: Common for promotional allowances
                - Contacts: John Smith (jsmith@walmart.com)
                - Notes: Always takes 2% discount even when late



In [27]:
# Testing filtered search for SOP documents only

sop_results = kb_enhanced.search(
    'payment processing',
    n_results=2,
    filter_metadata={'type':'sop'}
)

if sop_results and sop_results.get('documents') and sop_results['documents'][0]:
    for i, doc in enumerate(sop_results['documents'][0]):
        print(f'\n SOP {i+1}: {doc}')
else:
    print('No SOP documents found')


 SOP 1: CASH APPLICATION STANDARD OPERATING PROCEDURE

Document ID: SOP-AR-001

Version: 3.0

1.0 PURPOSE

This document outlines the procedure for applying customer payments.

2.0 PROCEDURE STEPS

2.1 Daily Processing

- Log into SAP system using T-code F-28

- Download bank statement from lockbox

- Match payments to open invoices

- Apply payments before 2 PM daily

2.2 Payment Matching Rules

- Exact amount match: Apply to specific invoice

- Partial payment: Apply to oldest invoice first

- No remittance: Park in unapplied cash (GL 1100.500)

2.3 Exception Handling

- Short payments: Create deduction ticket

- Over payments: Create credit memo

- Payment on account: Apply per customer instruction

3.0 KEY PERFORMANCE INDICATORS

- Application accuracy: >99%

- Daily completion rate: 100%

- Unapplied cash: <2% of total


In [31]:
from langchain_google_genai import GoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_core.messages import HumanMessage, AIMessage
import getpass

if 'GOOGLE_API_KEY' not in os.environ:
    os.environ['GOOGLE_API_KEY'] = getpass.getpass('Enter you API Key')

llm = GoogleGenerativeAI(model='gemini-2.5-flash',temperature=0.1)
embeddings = GoogleGenerativeAIEmbeddings(model='text-embedding-ada-002')

In [33]:
def enhanced_retrieve_context(state, knowledge_base):
    '''Retrieve context from knowledge base with document awareness'''
    query = state['query']

    print(f'Searching documents for: {query}')

    filter_metadata = None
    query_lower = query.lower()

    if any(term in query_lower for term in ['sop', 'procedure', 'process', 'step']):
        filter_metadata={'type':'sop'}
    
    elif any(term in query_lower for term in ['customer', 'walmart', 'amazon','target']):
        filter_metadata = {'type':'customer_notes'}
    elif any(term in query_lower for term in ['template', 'email', 'communication']):
        filter_metadata = {'type':'template'}
    elif any(term in query_lower for term in ['deduction', 'dc01', 'dc02','dc03']):
        filter_metadata = {'type':'guide'}
    
    results = knowledge_base.search(query,n_results=3, filter_metadata=filter_metadata)

    context_docs = []

    if results and results.get('documents'):
        documents = results['documents'][0]
        metadatas = results.get('metadatas', [[]])[0]
        for i, (doc,meta) in enumerate(zip(documents,metadatas)):
            source_name = Path(meta.get('source','unknown_document')).name
            doc_type = meta.get('type', 'Document')
            context_docs.append(f'[From: {source_name} | Type: {doc_type}]\n {doc} ')

    state['context'] = context_docs
    print(f'Retrieved {len(context_docs)} relevant document(s)')

    return state

test_state = {
    'query': 'How do I apply payments in SAP',
    'context': [],
    'messages':[]
}

print('\n Testing Retrieval')
result_state = enhanced_retrieve_context(test_state,kb_enhanced)
print(f'Retrieved {len(result_state['context'])} document(s)')
print(f'First document preview: \n{result_state['context'][0]}')


 Testing Retrieval
Searching documents for: How do I apply payments in SAP
Retrieved 3 relevant document(s)
Retrieved 3 document(s)
First document preview: 
[From: unknown_document | Type: sop]
 CASH APPLICATION STANDARD OPERATING PROCEDURE

Document ID: SOP-AR-001

Version: 3.0

1.0 PURPOSE

This document outlines the procedure for applying customer payments.

2.0 PROCEDURE STEPS

2.1 Daily Processing

- Log into SAP system using T-code F-28

- Download bank statement from lockbox

- Match payments to open invoices

- Apply payments before 2 PM daily

2.2 Payment Matching Rules

- Exact amount match: Apply to specific invoice

- Partial payment: Apply to oldest invoice first

- No remittance: Park in unapplied cash (GL 1100.500)

2.3 Exception Handling

- Short payments: Create deduction ticket

- Over payments: Create credit memo

- Payment on account: Apply per customer instruction

3.0 KEY PERFORMANCE INDICATORS

- Application accuracy: >99%

- Daily completion rate: 100%

- Unapp

In [None]:
from langchain_classic.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

def create_document_aware_response(query, knowledge_base:ARKnowledgeBaseEnhanced, experience_level ='new'):
    '''Create a response using document context'''

    print(f'Processing query: {query}')

    results = knowledge_base.search(query, n_results=3)

    context_text = ''

    if results and results.get('documents'):
        documents = results['documents'][0]
        metadatas = results.get('metadatas',[[]])[0]

        context_text = '\n\n---Relevant document---\n'

        for i, (doc, meta) in enumerate(zip(documents,metadatas)):
            source = Path(meta.get('source','Document')).name
            doc_type = meta.get('type','Info')
            context_text += f'\nDocument {i+1}: {source} ({doc_type})\n'
            context_text += f'Content: {doc}\n'

    prompt_template = ChatPromptTemplate.from_messages([
        ('system', '''You are an expert Accounts recivable trainer helping a new team member.
         You have access to company documents. Use them to provide accurate, helpful answers.
         
         If the answer is in the documents, reference them specifically.
         if not, use your general knowledge but say so.
         
         Tailor your response for {experience_level} users.
         Be clear, step-by-step, and practical.
         
         Available documents:
         {context}
         Now answer the user's question:'''),
         ('human', '{query}')
    ])

    chain = prompt_template | llm | StrOutputParser()

    response = chain.invoke({
        'query':query,
        'context':context_text,
        'experience_level':experience_level
    })

    return response

test_queries = [
    ('How do I handle customer deductions','new'),
    ('What is the procedure of cash application','new'),
    ('Tell me about wallmart payment process','intermidiate'),
    ('How do I use SAP for payments', 'experienced')
]

for query,level in test_queries:
    print(f'\n Query: {query}')
    response = create_document_aware_response(query, kb_enhanced, level)
    print(response)
    print('-'*100)


 Query: How do I handle customer deductions
Processing query: How do I handle customer deductions
Welcome to the team! Handling customer deductions is a key part of Accounts Receivable. Don't worry, we have a clear process for it.

Hereâ€™s a step-by-step guide on how to handle customer deductions, drawing from our company documents:

### **1. Initial Identification & Ticket Creation**

When you encounter a **short payment** (meaning the customer paid less than the invoice amount), your first step is to **create a deduction ticket**. This is part of our standard process for Cash Application.
*(Reference: Document 3, Cash Application Standard Operating Procedure, Section 2.3 Exception Handling)*

### **2. Understand Common Deduction Reasons**

Once a deduction ticket is created, you'll categorize it using a deduction code. Here are some of the most common ones you'll see:
*   **DC01 - Pricing Dispute:** The customer believes they were charged incorrectly.
*   **DC02 - Short Shipment:**