## Import Libraries

In [2]:
import os
import io
import uuid
import asyncio
import logging
import mimetypes
from datetime import datetime
from typing import List, Dict, Any, Optional, Union
from pathlib import Path

# Core frameworks
import aiofiles
from fastapi import FastAPI, UploadFile, File, HTTPException, Depends
from pydantic import BaseModel, Field, validator
import uvicorn

# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import Chroma, FAISS
from langchain.llms import OpenAI, Anthropic
from langchain.chat_models import ChatOpenAI, ChatAnthropic
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.memory import ConversationBufferWindowMemory
from langchain.schema import Document
from langchain.callbacks import StreamingStdOutCallbackHandler

# Document processing
import PyPDF2
from pdf2image import convert_from_bytes
import pytesseract
from PIL import Image
import docx
import pandas as pd

# Vector store and database
import chromadb
from pymongo import MongoClient
import motor.motor_asyncio

# Utilities
import tiktoken
from sentence_transformers import SentenceTransformer

In [3]:
# Configure logging: set the root logger to INFO and create the module-level
# logger that every class in this file uses to report errors.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [4]:
# Configuration
class Config:
    """Central settings: credentials, storage locations and processing limits.

    The attributes are evaluated once, when the class body executes, so
    environment variables are read at that point and not re-read later.
    """
    # API Keys
    # Either key is None when its environment variable is unset; the
    # embedding and LLM setup code checks for that before using a provider.
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
    
    # Database
    # Falls back to a local MongoDB instance when MONGODB_URL is unset.
    MONGODB_URL = os.getenv("MONGODB_URL", "mongodb://localhost:27017")
    DATABASE_NAME = "chat_pdf_db"
    
    # Vector Store
    CHROMA_PERSIST_DIR = "./chroma_db"
    # NOTE(review): FAISS_INDEX_PATH is not referenced by any code visible in
    # this file; only the Chroma directory is used.
    FAISS_INDEX_PATH = "./faiss_index"
    
    # File Processing
    MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB
    # Maps each accepted MIME type to the short label stored as
    # DocumentMetadata.file_type.
    # NOTE(review): 'application/msword' is accepted here, but
    # DocumentProcessor._extract_text has no branch for it and raises
    # ValueError, so legacy .doc uploads end up with status "failed".
    SUPPORTED_FORMATS = {
        'application/pdf': 'pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'application/msword': 'doc',
        'text/plain': 'txt',
        'application/vnd.ms-excel': 'xls',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
        'image/png': 'png',
        'image/jpeg': 'jpg',
        'image/tiff': 'tiff'
    }
    
    # Text Processing
    # CHUNK_SIZE and CHUNK_OVERLAP are handed to the text splitter together
    # with length_function=len, i.e. they are measured in characters.
    CHUNK_SIZE = 1000
    CHUNK_OVERLAP = 200
    # Passed as max_tokens to the chat models created in LLMManager.
    MAX_TOKENS = 4000

# Module-level settings object used by the rest of the file.
config = Config()

In [5]:
# Pydantic Models
# Document metadata
class DocumentMetadata(BaseModel):
    """Descriptive record for one uploaded document.

    Built by DocumentProcessor.process_document and persisted by
    DatabaseManager.save_document; also the response body of the
    ``/document/{doc_id}`` endpoint.
    """
    file_name: str
    # Short label taken from Config.SUPPORTED_FORMATS, or "unknown" when
    # processing failed.
    file_type: str
    # Length in bytes of the uploaded content (0 if it could not be read).
    file_size: int
    page_count: int = 0
    # Number of whitespace-separated tokens in the extracted text.
    word_count: int = 0
    character_count: int = 0
    # Naive local timestamp taken when the model instance is created.
    upload_date: datetime = Field(default_factory=datetime.now)
    # "pending" by default; the processor sets "completed" or "failed".
    processing_status: str = "pending"
    # Text of the exception that caused a "failed" status, otherwise None.
    error_message: Optional[str] = None

In [6]:
class ChatMessage(BaseModel):
    """One turn of a conversation, stored inside a ChatSession."""
    role: str  # 'user' or 'assistant'
    content: str
    # Naive local timestamp taken when the message object is created.
    timestamp: datetime = Field(default_factory=datetime.now)
    # For assistant messages, ChatService stores labels of the form
    # "<file_name> (chunk <index>)"; left as None for user messages.
    sources: Optional[List[str]] = None

In [7]:
class ChatSession(BaseModel):
    """A conversation about a single document, including its message history.

    Persisted by DatabaseManager.save_chat_session (keyed on ``session_id``)
    and returned by the ``/session/{session_id}`` endpoint.
    """
    # A fresh UUID is generated for every session created without an id.
    session_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    document_id: str
    # default_factory makes the "new list per instance" intent explicit and
    # matches how the other generated defaults in this model are declared.
    messages: List[ChatMessage] = Field(default_factory=list)
    created_at: datetime = Field(default_factory=datetime.now)
    # Refreshed by DatabaseManager.save_chat_session on every save.
    updated_at: datetime = Field(default_factory=datetime.now)

In [8]:
class QueryRequest(BaseModel):
    """Request body of the ``/chat`` endpoint."""
    question: str
    # When omitted (or when no stored session matches), ChatService starts a
    # new session.
    session_id: Optional[str] = None
    # Selects the per-document vector collection to search.
    document_id: str
    # NOTE(review): max_tokens and temperature are accepted but not read by
    # ChatService.chat_with_document in this file; the model settings fixed
    # in LLMManager are what actually apply.
    max_tokens: Optional[int] = 1000
    temperature: Optional[float] = 0.7

In [9]:
class QueryResponse(BaseModel):
    """Response body of the ``/chat`` endpoint."""
    answer: str
    # One "<file_name> (chunk <index>)" label per retrieved source chunk.
    sources: List[str]
    # Id of the session the exchange was stored under; send it back with the
    # next question to continue the conversation.
    session_id: str
    # NOTE(review): never populated by ChatService in this file, so it is
    # always None in practice.
    confidence_score: Optional[float] = None

In [10]:
# Database Manager
class DatabaseManager:
    """Async MongoDB access for document metadata and chat sessions.

    None of the methods raise: any exception is logged and reported through
    the return value (``False`` for saves, ``None`` for lookups).
    """

    def __init__(self):
        """Open the Motor client and bind the two collections used below."""
        mongo = motor.motor_asyncio.AsyncIOMotorClient(config.MONGODB_URL)
        database = mongo[config.DATABASE_NAME]
        self.client = mongo
        self.db = database
        self.documents = database.documents
        self.sessions = database.chat_sessions

    async def save_document(self, doc_id: str, metadata: DocumentMetadata) -> bool:
        """Upsert ``metadata`` under ``_id == doc_id``; return whether it worked."""
        try:
            selector = {"_id": doc_id}
            change = {"$set": metadata.dict()}
            await self.documents.update_one(selector, change, upsert=True)
        except Exception as exc:
            logger.error(f"Error saving document metadata: {exc}")
            return False
        return True

    async def get_document(self, doc_id: str) -> Optional[DocumentMetadata]:
        """Return the stored metadata for ``doc_id``, or ``None`` if absent or on error."""
        try:
            record = await self.documents.find_one({"_id": doc_id})
            if not record:
                return None
            # The Mongo key is not a DocumentMetadata field.
            record.pop("_id")
            return DocumentMetadata(**record)
        except Exception as exc:
            logger.error(f"Error retrieving document: {exc}")
            return None

    async def save_chat_session(self, session: ChatSession) -> bool:
        """Upsert ``session`` by its ``session_id`` after refreshing ``updated_at``.

        Note that the passed-in ``session`` object itself is modified.
        """
        try:
            session.updated_at = datetime.now()
            selector = {"session_id": session.session_id}
            change = {"$set": session.dict()}
            await self.sessions.update_one(selector, change, upsert=True)
        except Exception as exc:
            logger.error(f"Error saving chat session: {exc}")
            return False
        return True

    async def get_chat_session(self, session_id: str) -> Optional[ChatSession]:
        """Return the stored session for ``session_id``, or ``None`` if absent or on error."""
        try:
            record = await self.sessions.find_one({"session_id": session_id})
            if not record:
                return None
            # Drop Mongo's own key before rebuilding the model.
            record.pop("_id")
            return ChatSession(**record)
        except Exception as exc:
            logger.error(f"Error retrieving chat session: {exc}")
            return None


In [11]:
# Document Processor
class DocumentProcessor:
    """Convert an uploaded file into text chunks plus descriptive metadata.

    Handles PDFs (with an OCR fallback), PNG/JPEG/TIFF images (OCR), DOCX,
    plain text and Excel workbooks. The parsing and OCR libraries are
    synchronous, so their work runs in a worker thread to keep the event loop
    free while a document is being processed.
    """

    def __init__(self):
        """Create the character-based splitter configured from ``config``."""
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=config.CHUNK_SIZE,
            chunk_overlap=config.CHUNK_OVERLAP,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )

    async def process_document(self, file: UploadFile, doc_id: str) -> tuple[List[Document], DocumentMetadata]:
        """Read, extract and chunk ``file``.

        Args:
            file: The uploaded file; it is read from its current position.
            doc_id: Identifier recorded in every chunk's metadata.

        Returns:
            ``(documents, metadata)``. This method does not raise on failure:
            the error is logged and reported as ``([], metadata)`` where
            ``metadata.processing_status`` is ``"failed"`` and
            ``metadata.error_message`` carries the exception text.
        """
        # Bound before the try block so the failure path can always report them.
        content = b""
        file_name = file.filename or "unknown"

        try:
            # Read file content
            content = await file.read()

            # Detect file type
            mime_type = self._resolve_mime_type(file.content_type, file.filename)
            if mime_type not in config.SUPPORTED_FORMATS:
                raise ValueError(f"Unsupported file type: {mime_type}")

            # Extract text based on file type
            text, page_count = await self._extract_text(content, mime_type, file_name)

            # Create metadata
            metadata = DocumentMetadata(
                file_name=file_name,
                file_type=config.SUPPORTED_FORMATS[mime_type],
                file_size=len(content),
                page_count=page_count,
                word_count=len(text.split()),
                character_count=len(text),
                processing_status="completed"
            )

            # Split text into chunks
            chunks = self.text_splitter.split_text(text)
            total_chunks = len(chunks)

            # Create Document objects
            documents = [
                Document(
                    page_content=chunk,
                    metadata={
                        "document_id": doc_id,
                        "file_name": file_name,
                        "chunk_index": i,
                        "total_chunks": total_chunks
                    }
                )
                for i, chunk in enumerate(chunks)
            ]

            return documents, metadata

        except Exception as e:
            logger.error(f"Error processing document: {e}")
            metadata = DocumentMetadata(
                file_name=file_name,
                file_type="unknown",
                file_size=len(content),
                processing_status="failed",
                error_message=str(e)
            )
            return [], metadata

    @staticmethod
    def _resolve_mime_type(content_type: Optional[str], filename: Optional[str]) -> Optional[str]:
        """Pick the MIME type to process the upload as.

        The declared content type wins when it is supported. Otherwise (for
        example a generic ``application/octet-stream``) the filename's
        extension is consulted, so a correctly named file is still accepted.
        When neither is supported the declared type (or, failing that, the
        guess) is returned so the caller can report it.
        """
        if content_type in config.SUPPORTED_FORMATS:
            return content_type
        guessed = mimetypes.guess_type(filename)[0] if filename else None
        if guessed in config.SUPPORTED_FORMATS:
            return guessed
        return content_type or guessed

    async def _extract_text(self, content: bytes, mime_type: str, filename: str) -> tuple[str, int]:
        """Dispatch to the extractor for ``mime_type``.

        Returns ``(text, page_count)``; every format except PDF reports a page
        count of 1. ``filename`` is accepted for interface stability but not
        used. Raises ``ValueError`` for a MIME type without an extractor,
        which includes 'application/msword' even though the configuration
        lists it as supported.
        """
        if mime_type == 'application/pdf':
            return await self._extract_from_pdf(content)

        if mime_type in ('image/png', 'image/jpeg', 'image/tiff'):
            text = await self._extract_from_image(content)
        elif mime_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
            text = await self._extract_from_docx(content)
        elif mime_type == 'text/plain':
            # Undecodable bytes become U+FFFD instead of failing the upload.
            text = content.decode('utf-8', errors='replace')
        elif mime_type in ('application/vnd.ms-excel', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'):
            text = await self._extract_from_excel(content)
        else:
            raise ValueError(f"Unsupported MIME type: {mime_type}")
        return text, 1

    async def _extract_from_pdf(self, content: bytes) -> tuple[str, int]:
        """Extract text from a PDF, falling back to OCR.

        OCR is used when the embedded text layer is empty, and also when the
        PDF parser raises; in the latter case the page count is unknown and
        reported as 1.
        """
        try:
            # Try standard PDF text extraction first
            text, page_count = await asyncio.to_thread(self._read_pdf_text, content)

            # If no text extracted, use OCR
            if not text.strip():
                logger.info("No text found in PDF, using OCR...")
                text = await self._ocr_pdf(content)

            return text, page_count

        except Exception as e:
            logger.error(f"Error extracting from PDF: {e}")
            # Fallback to OCR
            return await self._ocr_pdf(content), 1

    @staticmethod
    def _read_pdf_text(content: bytes) -> tuple[str, int]:
        """Synchronously read the text layer of every page; return ``(text, page_count)``."""
        pages = PyPDF2.PdfReader(io.BytesIO(content)).pages
        # ``or ""`` guards against a page for which the extractor yields no
        # string, which would otherwise break the concatenation.
        text = "".join((page.extract_text() or "") + "\n" for page in pages)
        return text, len(pages)

    async def _ocr_pdf(self, content: bytes) -> str:
        """Rasterize the PDF and OCR each page; returns "" if anything fails."""
        def run_ocr() -> str:
            images = convert_from_bytes(content)
            return "".join(pytesseract.image_to_string(image) + "\n" for image in images)

        try:
            return await asyncio.to_thread(run_ocr)
        except Exception as e:
            logger.error(f"OCR failed: {e}")
            return ""

    async def _extract_from_image(self, content: bytes) -> str:
        """OCR a single image; returns "" if anything fails."""
        def run_ocr() -> str:
            # The context manager releases the image once OCR is done.
            with Image.open(io.BytesIO(content)) as image:
                return pytesseract.image_to_string(image)

        try:
            return await asyncio.to_thread(run_ocr)
        except Exception as e:
            logger.error(f"Image OCR failed: {e}")
            return ""

    async def _extract_from_docx(self, content: bytes) -> str:
        """Join the text of the document's paragraphs, one per line; "" on failure.

        Only ``doc.paragraphs`` is read, so content held elsewhere in the
        file (for example inside tables) is not included.
        """
        def read_paragraphs() -> str:
            doc = docx.Document(io.BytesIO(content))
            return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)

        try:
            return await asyncio.to_thread(read_paragraphs)
        except Exception as e:
            logger.error(f"DOCX extraction failed: {e}")
            return ""

    async def _extract_from_excel(self, content: bytes) -> str:
        """Render the workbook as text via pandas; returns "" on failure.

        ``read_excel`` is called without ``sheet_name``, so only the sheet
        pandas loads by default is included.
        """
        def read_sheet() -> str:
            return pd.read_excel(io.BytesIO(content)).to_string()

        try:
            return await asyncio.to_thread(read_sheet)
        except Exception as e:
            logger.error(f"Excel extraction failed: {e}")
            return ""

In [12]:
# Vector Store Manager
class VectorStoreManager:
    """Own the embedding model and the persistent Chroma client.

    Every document is stored in its own Chroma collection named
    ``doc_<doc_id>``.
    """

    def __init__(self):
        """Choose the embedding backend and open the on-disk Chroma client."""
        # Initialize embeddings
        if config.OPENAI_API_KEY:
            self.embeddings = OpenAIEmbeddings(openai_api_key=config.OPENAI_API_KEY)
        else:
            # Fallback to local embeddings
            # NOTE(review): a collection should be queried with the same
            # embedding model it was indexed with; adding or removing the
            # OpenAI key between runs would presumably break that - confirm.
            self.embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            )

        # Initialize vector store
        self.chroma_client = chromadb.PersistentClient(path=config.CHROMA_PERSIST_DIR)

    def _open_collection(self, doc_id: str):
        """Return a Chroma vector store bound to the collection for ``doc_id``."""
        return Chroma(
            client=self.chroma_client,
            collection_name=f"doc_{doc_id}",
            embedding_function=self.embeddings
        )

    async def add_documents(self, documents: List[Document], doc_id: str) -> bool:
        """Embed ``documents`` and store them in the collection for ``doc_id``.

        Returns ``True`` on success; any exception is logged and reported as
        ``False``.
        """
        try:
            vectorstore = self._open_collection(doc_id)

            # Embedding and insertion are blocking calls, so they run in the
            # default executor. get_running_loop() is the recommended way to
            # obtain the loop from inside a coroutine.
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(None, vectorstore.add_documents, documents)

            return True

        except Exception as e:
            logger.error(f"Error adding documents to vector store: {e}")
            return False

    async def get_retriever(self, doc_id: str, k: int = 4):
        """Return a retriever yielding the top ``k`` chunks for ``doc_id``.

        Returns ``None`` only when building the retriever raises.
        NOTE(review): opening a collection that was never indexed does not
        appear to raise here, so ``None`` is not a reliable "unknown
        document" signal - confirm against the Chroma wrapper in use.
        """
        try:
            vectorstore = self._open_collection(doc_id)
            return vectorstore.as_retriever(search_kwargs={"k": k})

        except Exception as e:
            logger.error(f"Error creating retriever: {e}")
            return None

In [14]:
# LLM Manager
class LLMManager:
    """Registry of the chat models whose API keys are configured."""

    def __init__(self):
        """Register one chat model per provider that has an API key set.

        OpenAI is registered before Anthropic; that order decides which model
        ``get_model`` falls back to.
        """
        self.models = {}

        openai_key = config.OPENAI_API_KEY
        if openai_key:
            openai_chat = ChatOpenAI(
                openai_api_key=openai_key,
                model_name="gpt-3.5-turbo",
                temperature=0.7,
                max_tokens=config.MAX_TOKENS
            )
            self.models['openai'] = openai_chat

        anthropic_key = config.ANTHROPIC_API_KEY
        if anthropic_key:
            anthropic_chat = ChatAnthropic(
                anthropic_api_key=anthropic_key,
                model="claude-3-sonnet-20240229",
                max_tokens=config.MAX_TOKENS
            )
            self.models['anthropic'] = anthropic_chat

    def get_model(self, provider: str = "openai"):
        """Return the model for ``provider``.

        Falls back to the first registered model when ``provider`` is not
        available, and to ``None`` when no model is registered at all.
        """
        if provider in self.models:
            return self.models[provider]
        return next(iter(self.models.values()), None)


In [15]:
# Main Chat Service
class ChatService:
    """Coordinate document ingestion and retrieval-augmented chat.

    Ties together the database, document processor, vector store and LLM
    managers, and translates failures into ``HTTPException`` for the API
    layer.
    """

    def __init__(self):
        """Instantiate the four collaborating managers."""
        self.db_manager = DatabaseManager()
        self.doc_processor = DocumentProcessor()
        self.vector_manager = VectorStoreManager()
        self.llm_manager = LLMManager()

    async def upload_document(self, file: UploadFile) -> str:
        """Process, index and register an uploaded file.

        Returns:
            The newly generated document id.

        Raises:
            HTTPException: 400 when the file exceeds ``config.MAX_FILE_SIZE``
                or cannot be processed; 500 when indexing fails or an
                unexpected error occurs.
        """
        # Generate document ID
        doc_id = str(uuid.uuid4())

        try:
            # Validate file size
            content = await file.read()
            if len(content) > config.MAX_FILE_SIZE:
                raise HTTPException(400, "File too large")

            # Reset file position so the processor can read the file again
            await file.seek(0)

            # Process document
            documents, metadata = await self.doc_processor.process_document(file, doc_id)

            if metadata.processing_status == "failed":
                raise HTTPException(400, f"Document processing failed: {metadata.error_message}")

            # Save to vector store (skipped when no text chunks were produced)
            if documents:
                success = await self.vector_manager.add_documents(documents, doc_id)
                if not success:
                    raise HTTPException(500, "Failed to index document")

            # Save metadata to database
            await self.db_manager.save_document(doc_id, metadata)

            return doc_id

        except HTTPException as e:
            # Deliberate HTTP errors must keep their own status code rather
            # than being rewrapped as a 500 by the generic handler below.
            logger.error(f"Error uploading document: {e.detail}")
            raise
        except Exception as e:
            logger.error(f"Error uploading document: {e}")
            raise HTTPException(500, str(e)) from e

    async def chat_with_document(self, query: QueryRequest) -> QueryResponse:
        """Answer ``query.question`` from the indexed document and record the exchange.

        An existing session is continued when ``query.session_id`` matches a
        stored one; otherwise a new session is created (reusing the supplied
        id if there was one).

        Raises:
            HTTPException: 404 when no retriever could be built for the
                document; 500 when no LLM is configured or an unexpected
                error occurs.
        """
        try:
            # Get or create chat session
            session = None
            if query.session_id:
                session = await self.db_manager.get_chat_session(query.session_id)

            if not session:
                session = ChatSession(
                    document_id=query.document_id,
                    session_id=query.session_id or str(uuid.uuid4())
                )

            # Get retriever
            retriever = await self.vector_manager.get_retriever(query.document_id)
            if not retriever:
                raise HTTPException(404, "Document not found or not indexed")

            # Get LLM
            llm = self.llm_manager.get_model("openai")
            if not llm:
                raise HTTPException(500, "No LLM available")

            # Create conversation chain
            # The chain returns both "answer" and "source_documents"; the
            # memory must be told which one to record, otherwise saving the
            # context fails because more than one output key is present.
            memory = ConversationBufferWindowMemory(
                memory_key="chat_history",
                output_key="answer",
                return_messages=True,
                k=5  # Keep last 5 exchanges
            )

            # Load previous conversation
            for msg in session.messages[-10:]:  # Last 10 messages
                if msg.role == "user":
                    memory.chat_memory.add_user_message(msg.content)
                else:
                    memory.chat_memory.add_ai_message(msg.content)

            # Create chain
            qa_chain = ConversationalRetrievalChain.from_llm(
                llm=llm,
                retriever=retriever,
                memory=memory,
                return_source_documents=True,
                verbose=True
            )

            # Get answer; the chain call is blocking, so run it in the
            # default executor of the running loop.
            loop = asyncio.get_running_loop()
            result = await loop.run_in_executor(
                None, qa_chain, {"question": query.question}
            )

            answer = result["answer"]
            source_docs = result.get("source_documents", [])

            # Extract sources
            sources = [
                f"{doc.metadata.get('file_name', 'Unknown')} (chunk {doc.metadata.get('chunk_index', 0)})"
                for doc in source_docs
            ]

            # Add messages to session
            session.messages.append(ChatMessage(role="user", content=query.question))
            session.messages.append(ChatMessage(
                role="assistant", 
                content=answer, 
                sources=sources
            ))

            # Save session
            await self.db_manager.save_chat_session(session)

            return QueryResponse(
                answer=answer,
                sources=sources,
                session_id=session.session_id
            )

        except HTTPException as e:
            # Preserve the intended status code (e.g. the 404 above).
            logger.error(f"Error in chat: {e.detail}")
            raise
        except Exception as e:
            logger.error(f"Error in chat: {e}")
            raise HTTPException(500, str(e)) from e

In [17]:
# FastAPI Application
# The ASGI application object the route decorators below attach to.
app = FastAPI(title="Chat with PDF API", version="1.0.0")
# Single service instance shared by all routes; constructing it builds the
# database, document-processing, vector-store and LLM managers right here.
chat_service = ChatService()

@app.post("/upload", response_model=dict)
async def upload_document(file: UploadFile = File(...)):
    """Ingest the uploaded file and report the id it was stored under."""
    new_document_id = await chat_service.upload_document(file)
    payload = {"document_id": new_document_id, "status": "uploaded"}
    return payload

@app.post("/chat", response_model=QueryResponse)
async def chat_with_document(query: QueryRequest):
    """Answer a question about a previously uploaded document."""
    response = await chat_service.chat_with_document(query)
    return response

@app.get("/document/{doc_id}", response_model=DocumentMetadata)
async def get_document_info(doc_id: str):
    """Return the stored metadata for ``doc_id``, or 404 when there is none."""
    found = await chat_service.db_manager.get_document(doc_id)
    if found:
        return found
    raise HTTPException(404, "Document not found")

@app.get("/session/{session_id}", response_model=ChatSession)
async def get_chat_session(session_id: str):
    """Return the stored chat session for ``session_id``, or 404 when there is none."""
    found = await chat_service.db_manager.get_chat_session(session_id)
    if found:
        return found
    raise HTTPException(404, "Session not found")

@app.get("/health")
async def health_check():
    """Liveness probe: always reports a healthy status."""
    status_report = {"status": "healthy"}
    return status_report

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


In [18]:
if __name__ == "__main__":
    # Start the development server. The app is given as the import string
    # "main:app" rather than as an object.
    # NOTE(review): this presumably expects the code to live in a module
    # named main.py - confirm before running it from a notebook.
    server_options = {
        "host": "0.0.0.0",
        "port": 8000,
        "reload": True,
        "log_level": "info",
    }
    uvicorn.run("main:app", **server_options)

INFO:     Will watch for changes in these directories: ['d:\\Designing-Large-Language-Model-Applications\\Chapter 1']
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
INFO:     Started reloader process [19936] using WatchFiles
INFO:     Stopping reloader process [19936]
