In [None]:
!pip install fastapi uvicorn python-multipart
!pip install langchain langchain-community
!pip install sentence-transformers
!pip install faiss-cpu
!pip install PyMuPDF
!pip install python-dotenv
!pip install openai

In [1]:
# Advanced RAG Backend with Groq API
# This notebook runs the RAG backend API (FastAPI); the frontend lives outside the notebook in ../frontend and is only served as static files

import os
import io
import json
import asyncio
from pathlib import Path
from typing import List, Optional
import uvicorn
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
import fitz  # PyMuPDF
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
import openai
import nest_asyncio
import threading
import time
import warnings
warnings.filterwarnings("ignore")

class Config:
    """Runtime settings for the RAG backend.

    Constructing an instance loads variables from a ``.env`` file, reads
    ``GROQ_API_KEY`` from the environment, sets the model/chunking defaults
    and creates the data directories if they do not exist yet.

    Raises:
        ValueError: if ``GROQ_API_KEY`` is unset or empty.
    """

    def __init__(self):
        load_dotenv()

        api_key = os.getenv("GROQ_API_KEY")
        self.GROQ_API_KEY = api_key
        if not api_key:
            raise ValueError("GROQ_API_KEY not found in environment variables")

        # Storage locations, relative to the current working directory
        self.VECTOR_STORE_PATH = "data/index"
        self.UPLOAD_PATH = "data/uploads"
        self.PROCESSED_PATH = "data/processed"

        # Models
        self.MODEL_NAME = "llama3-8b-8192"  # Groq model
        self.EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

        # Chunking / retrieval parameters
        self.CHUNK_SIZE = 500
        self.CHUNK_OVERLAP = 50
        self.MAX_RETRIEVED_CHUNKS = 5

        # Make sure every data directory exists before anything writes to it
        for directory in (self.UPLOAD_PATH, self.PROCESSED_PATH, self.VECTOR_STORE_PATH):
            os.makedirs(directory, exist_ok=True)

# Module-level settings instance shared by the classes and endpoints below.
# Instantiating Config loads .env, raises ValueError when GROQ_API_KEY is
# missing, and creates the data directories.
config = Config()

class PDFParser:
    """Plain-text extraction from PDF files using PyMuPDF."""

    @staticmethod
    def extract_text_from_pdf(pdf_path: str) -> str:
        """Extract the text of every non-empty page of a PDF.

        Each page's text has double newlines collapsed to single ones and is
        stripped; non-empty pages are then joined with a blank line.

        Args:
            pdf_path: Path of the PDF file to open.

        Returns:
            The concatenated page text with surrounding whitespace removed
            (an empty string when no page yields any text).

        Raises:
            Exception: if the document cannot be opened or read. The
                underlying error is chained as ``__cause__``.
        """
        try:
            # The context manager closes the document even when reading a
            # page raises, so the file handle is never leaked.
            with fitz.open(pdf_path) as doc:
                pages = []
                for page_num in range(doc.page_count):
                    # Basic preprocessing: collapse blank lines, trim edges
                    page_text = doc[page_num].get_text().replace('\n\n', '\n').strip()
                    if page_text:
                        pages.append(page_text)

            return "\n\n".join(pages).strip()

        except Exception as e:
            raise Exception(f"Error extracting text from PDF: {str(e)}") from e

class TextChunker:
    """Splits text into overlapping chunks with a recursive character splitter."""

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50):
        """Configure the splitter.

        Args:
            chunk_size: Maximum chunk length, measured with ``len``.
            chunk_overlap: Overlap between consecutive chunks.
        """
        splitter_options = dict(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            # Separators are tried in this order: paragraph, line, word, character
            separators=["\n\n", "\n", " ", ""],
        )
        self.text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

    def chunk_text(self, text: str) -> List[str]:
        """Return the whitespace-trimmed, non-empty chunks of ``text``."""
        trimmed = (piece.strip() for piece in self.text_splitter.split_text(text))
        return [piece for piece in trimmed if piece]

class Embedder:
    """Owns the embedding model and the most recently built/loaded FAISS store."""

    def __init__(self, model_name: str):
        """Load the HuggingFace embedding model on the CPU.

        Args:
            model_name: Name of the sentence-transformers model to load.
        """
        self.embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={'device': 'cpu'}
        )
        self.vector_store = None

    def create_vector_store(self, chunks: List[str], pdf_filename: str) -> FAISS:
        """Create FAISS vector store from text chunks.

        Each chunk becomes a Document whose metadata records the source
        filename and the chunk's position in ``chunks``. The new store
        replaces ``self.vector_store``.

        Args:
            chunks: Text chunks to embed.
            pdf_filename: Stored as the ``source`` metadata of every chunk.

        Returns:
            The newly built vector store.
        """
        documents = [
            Document(
                page_content=chunk,
                metadata={"source": pdf_filename, "chunk_id": i}
            )
            for i, chunk in enumerate(chunks)
        ]

        self.vector_store = FAISS.from_documents(documents, self.embeddings)
        return self.vector_store

    def save_vector_store(self, path: str):
        """Save vector store to disk; does nothing when no store exists yet."""
        if self.vector_store is not None:
            self.vector_store.save_local(path)

    def load_vector_store(self, path: str) -> Optional[FAISS]:
        """Load vector store from disk.

        Args:
            path: Directory previously written by ``save_vector_store``.

        Returns:
            The loaded store (also kept in ``self.vector_store``), or ``None``
            when loading fails; in that case the current store is left
            untouched and the error is printed.
        """
        try:
            # SECURITY: allow_dangerous_deserialization opts in to unsafe
            # (pickle-based) loading. Only point this at index directories
            # written by this application, never at untrusted files.
            self.vector_store = FAISS.load_local(
                path,
                self.embeddings,
                allow_dangerous_deserialization=True
            )
            return self.vector_store
        except Exception as e:
            # Best-effort load: report the reason instead of hiding it, and
            # let KeyboardInterrupt/SystemExit propagate.
            print(f"Error loading vector store from '{path}': {e}")
            return None

class Retriever:
    """Similarity search over a vector store."""

    def __init__(self, vector_store: FAISS, embeddings):
        """Keep references to the store to search and its embedding model."""
        self.embeddings = embeddings
        self.vector_store = vector_store

    def retrieve_similar_chunks(self, query: str, k: int = 5) -> List[Document]:
        """Return up to ``k`` documents most similar to ``query``.

        Any error raised by the search is printed and an empty list is
        returned instead.
        """
        try:
            return self.vector_store.similarity_search(query, k=k)
        except Exception as err:
            print(f"Error in retrieval: {err}")
        return []

class LLMInterface:
    """Chat-completion client for Groq's OpenAI-compatible endpoint."""

    def __init__(self, api_key: str, model_name: str):
        """Create the client.

        Args:
            api_key: Groq API key.
            model_name: Model identifier sent with every completion request.
        """
        self.client = openai.OpenAI(
            api_key=api_key,
            base_url="https://api.groq.com/openai/v1",
        )
        self.model_name = model_name

    def generate_answer(self, context: str, question: str) -> str:
        """Answer ``question`` from ``context`` only.

        Args:
            context: Retrieved document text the model must rely on.
            question: The user's question.

        Returns:
            The model's reply with surrounding whitespace removed. If the
            request fails, a string starting with
            ``"Error generating response: "`` is returned instead of raising.
        """
        # Grounding prompt: instructs the model to refuse rather than guess
        prompt = (
            "You are a helpful assistant that answers questions based STRICTLY on the provided context. \n"
            "\n"
            "IMPORTANT INSTRUCTIONS:\n"
            "1. Only use information from the provided context to answer the question\n"
            '2. If the answer is not in the context, say "I cannot find this information in the provided document"\n'
            "3. Do not add information from your general knowledge\n"
            "4. Be specific and cite relevant parts of the context when possible\n"
            "5. If the context is unclear or insufficient, acknowledge this limitation\n"
            "\n"
            "Context:\n"
            f"{context}\n"
            "\n"
            f"Question: {question}\n"
            "\n"
            "Answer based solely on the context provided:"
        )

        try:
            request_options = {
                "model": self.model_name,
                "messages": [
                    {"role": "system", "content": "You are a precise document assistant that only answers based on provided context."},
                    {"role": "user", "content": prompt},
                ],
                "max_tokens": 512,
                "temperature": 0.1,  # Low temperature to reduce hallucination
                "top_p": 0.9,
                "frequency_penalty": 0.0,
                "presence_penalty": 0.0,
            }
            completion = self.client.chat.completions.create(**request_options)
            return completion.choices[0].message.content.strip()
        except Exception as err:
            return f"Error generating response: {str(err)}"

class RAGApplication:
    """Coordinates PDF ingestion and question answering over one active document.

    Only the most recently processed PDF is queryable: processing a new file
    replaces the current retriever and document name.
    """

    def __init__(self):
        """Build the pipeline components from the module-level ``config``."""
        self.config = config
        self.pdf_parser = PDFParser()
        self.chunker = TextChunker(
            chunk_size=self.config.CHUNK_SIZE,
            chunk_overlap=self.config.CHUNK_OVERLAP
        )
        self.embedder = Embedder(self.config.EMBEDDING_MODEL)
        self.llm = LLMInterface(self.config.GROQ_API_KEY, self.config.MODEL_NAME)
        self.current_retriever = None
        self.current_pdf_name = None

    @staticmethod
    def _safe_filename(filename: Optional[str]) -> str:
        """Reduce a client-supplied filename to its final path component.

        Raises:
            ValueError: if nothing usable remains (missing name, ".", "..").
        """
        # Treat backslashes as separators too, so "..\\..\\x.pdf" is also
        # stripped on hosts where os.path only recognises "/".
        name = os.path.basename((filename or "").replace("\\", "/")).strip()
        if name in ("", ".", ".."):
            raise ValueError("Invalid or missing filename")
        return name

    async def process_pdf(self, pdf_file: UploadFile) -> dict:
        """Store an uploaded PDF, index it and make it the active document.

        Steps: save the upload, extract text, chunk it, build and persist the
        FAISS index, then install a retriever over the new index.

        Args:
            pdf_file: The uploaded file.

        Returns:
            On success, a dict with ``status`` ``"success"``, a ``message``,
            ``chunks_created`` and ``text_length``. On any failure, a dict
            with ``status`` ``"error"`` and a ``message``; no exception
            propagates to the caller.
        """
        try:
            # The filename comes from the client: keep only its last path
            # component so the upload cannot be written outside UPLOAD_PATH.
            filename = self._safe_filename(pdf_file.filename)

            # Save uploaded file
            pdf_path = os.path.join(self.config.UPLOAD_PATH, filename)
            content = await pdf_file.read()
            with open(pdf_path, "wb") as f:
                f.write(content)

            # Extract text
            text = self.pdf_parser.extract_text_from_pdf(pdf_path)

            if not text.strip():
                raise Exception("No text found in PDF")

            # Chunk text
            chunks = self.chunker.chunk_text(text)

            if not chunks:
                raise Exception("No valid chunks created from PDF")

            # Create vector store
            vector_store = self.embedder.create_vector_store(chunks, filename)

            # Save vector store
            vector_store_path = os.path.join(self.config.VECTOR_STORE_PATH, "faiss_index")
            self.embedder.save_vector_store(vector_store_path)

            # Create retriever; state is only switched once indexing succeeded
            self.current_retriever = Retriever(vector_store, self.embedder.embeddings)
            self.current_pdf_name = filename

            return {
                "status": "success",
                "message": f"PDF '{filename}' processed successfully",
                "chunks_created": len(chunks),
                "text_length": len(text)
            }

        except Exception as e:
            return {
                "status": "error",
                "message": f"Error processing PDF: {str(e)}"
            }

    def query_document(self, question: str) -> dict:
        """Query the processed document.

        Args:
            question: The user's question.

        Returns:
            On success, a dict with ``status`` ``"success"``, the ``answer``,
            the number of retrieved chunks as ``sources`` and the active
            ``document`` name. Otherwise a dict with ``status`` ``"error"``
            and a ``message`` (no document loaded, nothing retrieved, or an
            unexpected failure).
        """
        try:
            if not self.current_retriever:
                return {
                    "status": "error",
                    "message": "No document has been processed yet. Please upload a PDF first."
                }

            # Retrieve similar chunks
            retrieved_docs = self.current_retriever.retrieve_similar_chunks(
                question,
                k=self.config.MAX_RETRIEVED_CHUNKS
            )

            if not retrieved_docs:
                return {
                    "status": "error",
                    "message": "No relevant information found in the document."
                }

            # Prepare context
            context = "\n\n".join(doc.page_content for doc in retrieved_docs)

            # Generate answer
            answer = self.llm.generate_answer(context, question)

            return {
                "status": "success",
                "answer": answer,
                "sources": len(retrieved_docs),
                "document": self.current_pdf_name
            }

        except Exception as e:
            return {
                "status": "error",
                "message": f"Error querying document: {str(e)}"
            }

# Initialize FastAPI app
app = FastAPI(title="Advanced RAG PDF Query System", version="1.0.0")

# Add CORS middleware.
# A wildcard origin must not be combined with credentialed requests. None of
# the endpoints defined in this cell read cookies or auth headers, so
# credentials are disabled rather than narrowing the allowed origins.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount static files for frontend, but only when the directory is present so
# the backend can still start without it.
FRONTEND_DIR = "../frontend"
if os.path.isdir(FRONTEND_DIR):
    app.mount("/static", StaticFiles(directory=FRONTEND_DIR), name="static")
else:
    print(f"Frontend directory '{FRONTEND_DIR}' not found; /static will not be served")

# Initialize RAG application
rag_app = RAGApplication()

@app.post("/api/upload")
async def upload_pdf(file: UploadFile = File(...)):
    """Upload and process PDF file.

    Responds with HTTP 400 when the upload has no filename or the filename
    does not end in ``.pdf`` (case-insensitive); otherwise returns the result
    dict produced by ``rag_app.process_pdf`` as JSON.
    """
    # The filename can be missing on a malformed multipart part; treat that
    # as "not a PDF" instead of failing with an AttributeError (HTTP 500).
    filename = file.filename or ""
    if not filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files are allowed")

    result = await rag_app.process_pdf(file)
    return JSONResponse(content=result)

@app.post("/api/query")
async def query_document(question: str = Form(...)):
    """Query the processed document.

    Returns the result dict of ``rag_app.query_document`` as JSON.
    """
    # rag_app.query_document is synchronous (vector search plus a blocking
    # HTTP call to the LLM); run it in the default executor so other requests
    # are still served while it is in progress.
    loop = asyncio.get_running_loop()
    result = await loop.run_in_executor(None, rag_app.query_document, question)
    return JSONResponse(content=result)

@app.get("/api/health")
async def health_check():
    """Liveness probe: always reports that the service is up."""
    payload = dict(status="healthy", message="RAG system is running")
    return payload

@app.get("/api/status")
async def get_status():
    """Report the active document (if any) and the configured model names."""
    active_document = rag_app.current_pdf_name
    status = {
        "document_loaded": active_document is not None,
        "current_document": active_document,
        "model": config.MODEL_NAME,
        "embedding_model": config.EMBEDDING_MODEL,
    }
    return status

# Enable nested event loops for Jupyter (patches asyncio so an event loop can
# be run while another one is already running). Must be applied before the
# server is started below.
nest_asyncio.apply()

def run_server(host: str = "0.0.0.0", port: int = 8000):
    """Run the FastAPI server; blocks until the server exits.

    Meant to be used as a thread target so the notebook cell can return.

    Args:
        host: Interface to bind. The default ``"0.0.0.0"`` listens on all
            interfaces; pass ``"127.0.0.1"`` to keep the server local.
        port: TCP port to listen on.
    """
    uvicorn.run(
        app,
        host=host,
        port=port,
        reload=False,
        log_level="info"
    )

# Start the server
print("Starting Advanced RAG PDF Query System...")
# Report the configured model instead of repeating its name by hand
print(f"Using Groq API with model: {config.MODEL_NAME}")
print("Server will be available at: http://localhost:8000")
print("Make sure to set your GROQ_API_KEY in the .env file")
print("Starting server in background...")

# Run server in a separate daemon thread so this cell returns and the
# thread does not keep the kernel process alive on shutdown
server_thread = threading.Thread(target=run_server, daemon=True)
server_thread.start()

# Give the server time to start
time.sleep(3)

# If run_server has already returned or raised, the thread is no longer
# alive; only announce success while it is still running.
if server_thread.is_alive():
    print("Server started successfully!")
    print("Backend API endpoints available at: http://localhost:8000/api/")
    print("Open your frontend at: http://localhost:8000/static/")
    print("To stop the server, restart the kernel or interrupt the notebook")
else:
    print("Server failed to start - check the log output above (is port 8000 already in use?)")

INFO:     Started server process [17832]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


Starting Advanced RAG PDF Query System...
Using Groq API with model: llama3-8b-8192
Server will be available at: http://localhost:8000
Make sure to set your GROQ_API_KEY in the .env file
Starting server in background...
Server started successfully!
Backend API endpoints available at: http://localhost:8000/api/
Open your frontend at: http://localhost:8000/static/
To stop the server, restart the kernel or interrupt the notebook


INFO:     127.0.0.1:53313 - "GET / HTTP/1.1" 404 Not Found
INFO:     127.0.0.1:53314 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO:     127.0.0.1:53319 - "GET /api/status HTTP/1.1" 200 OK
INFO:     127.0.0.1:53337 - "GET /api/health HTTP/1.1" 200 OK
INFO:     127.0.0.1:53339 - "GET /api/status HTTP/1.1" 200 OK
