In [None]:
!pip install fastapi uvicorn python-multipart
!pip install langchain langchain-community
!pip install sentence-transformers
!pip install faiss-cpu
!pip install PyMuPDF
!pip install python-dotenv
!pip install openai

In [None]:
# Advanced RAG Project with Groq API
# This notebook contains all the RAG functionality in a single file

import os
import io
import json
import asyncio
from pathlib import Path
from typing import List, Optional
import uvicorn
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.middleware.cors import CORSMiddleware
import fitz  # PyMuPDF
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
import openai
import nest_asyncio
import threading
import time
import warnings
warnings.filterwarnings("ignore")

class Config:
    """Settings for the RAG pipeline, resolved once when instantiated.

    Construction loads environment variables through ``load_dotenv``, requires
    ``GROQ_API_KEY`` to be present, and creates the data directories.

    Raises:
        ValueError: If ``GROQ_API_KEY`` is missing or empty.
    """

    def __init__(self):
        load_dotenv()

        api_key = os.getenv("GROQ_API_KEY")
        self.GROQ_API_KEY = api_key
        if not api_key:
            raise ValueError("GROQ_API_KEY not found in environment variables")

        # Where uploads, processed artefacts and the FAISS index live on disk.
        self.VECTOR_STORE_PATH = "data/index"
        self.UPLOAD_PATH = "data/uploads"
        self.PROCESSED_PATH = "data/processed"

        # Models: chat completion model served by Groq, and the embedding model.
        self.MODEL_NAME = "llama3-8b-8192"
        self.EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

        # Chunking and retrieval parameters.
        self.CHUNK_SIZE = 500
        self.CHUNK_OVERLAP = 50
        self.MAX_RETRIEVED_CHUNKS = 5

        # Make sure every data directory exists before anything writes to it.
        for directory in (
            self.UPLOAD_PATH,
            self.PROCESSED_PATH,
            self.VECTOR_STORE_PATH,
        ):
            os.makedirs(directory, exist_ok=True)

config = Config()

class PDFParser:
    """Stateless helper that turns a PDF file into plain text via PyMuPDF."""

    @staticmethod
    def extract_text_from_pdf(pdf_path: str) -> str:
        """Extract the text of every page in a PDF.

        For each page, doubled newlines are collapsed once and surrounding
        whitespace is stripped. Pages left with no text are skipped; the
        remaining pages are joined with a blank line.

        Args:
            pdf_path: Filesystem path of the PDF to read.

        Returns:
            The concatenated page text, or an empty string when no page
            contains text.

        Raises:
            Exception: If the document cannot be opened or a page cannot be
                read. The underlying error is chained as ``__cause__``.
        """
        try:
            # The context manager closes the document even when a page fails
            # to parse, so the file handle is never leaked.
            with fitz.open(pdf_path) as doc:
                page_texts = []
                for page in doc:
                    # Basic preprocessing
                    page_text = page.get_text().replace('\n\n', '\n').strip()
                    if page_text:
                        page_texts.append(page_text)

            return "\n\n".join(page_texts)

        except Exception as e:
            raise Exception(f"Error extracting text from PDF: {str(e)}") from e

class TextChunker:
    """Wraps a ``RecursiveCharacterTextSplitter`` with fixed separators."""

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50):
        # Separators are tried in order: paragraph, line, word, then character.
        splitter_options = {
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
            "length_function": len,
            "separators": ["\n\n", "\n", " ", ""],
        }
        self.text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

    def chunk_text(self, text: str) -> List[str]:
        """Split ``text`` and return the stripped, non-empty pieces in order."""
        cleaned = []
        for piece in self.text_splitter.split_text(text):
            trimmed = piece.strip()
            if trimmed:
                cleaned.append(trimmed)
        return cleaned

class Embedder:
    """Owns the embedding model and the FAISS vector store built from it."""

    def __init__(self, model_name: str):
        """Load the HuggingFace embedding model named ``model_name`` on CPU."""
        self.embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={'device': 'cpu'}
        )
        # Populated by create_vector_store() or load_vector_store().
        self.vector_store = None

    def create_vector_store(self, chunks: List[str], pdf_filename: str) -> FAISS:
        """Create a FAISS vector store from text chunks.

        Args:
            chunks: Text chunks to embed; must not be empty.
            pdf_filename: Stored as the ``source`` metadata of every chunk.

        Returns:
            The new vector store, which is also kept on ``self.vector_store``.

        Raises:
            ValueError: If ``chunks`` is empty.
        """
        if not chunks:
            # Fail with a clear message instead of an obscure error from
            # inside the index construction.
            raise ValueError("Cannot create a vector store from an empty list of chunks")

        documents = [
            Document(
                page_content=chunk,
                metadata={"source": pdf_filename, "chunk_id": i}
            )
            for i, chunk in enumerate(chunks)
        ]

        self.vector_store = FAISS.from_documents(documents, self.embeddings)
        return self.vector_store

    def save_vector_store(self, path: str):
        """Save the current vector store to ``path``; no-op when none exists."""
        # Compare against None explicitly rather than relying on the store
        # object's truthiness.
        if self.vector_store is not None:
            self.vector_store.save_local(path)

    def load_vector_store(self, path: str) -> Optional[FAISS]:
        """Load a vector store from disk.

        Returns:
            The loaded store (also kept on ``self.vector_store``), or ``None``
            when loading fails; in that case ``self.vector_store`` is left
            untouched.
        """
        try:
            # SECURITY: allow_dangerous_deserialization enables pickle-based
            # loading. Only point this at index files this application wrote
            # itself, never at files from an untrusted source.
            loaded_store = FAISS.load_local(
                path,
                self.embeddings,
                allow_dangerous_deserialization=True
            )
        except Exception as e:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit still propagate.
            print(f"Could not load vector store from '{path}': {e}")
            return None

        self.vector_store = loaded_store
        return loaded_store

class Retriever:
    """Similarity search over a vector store."""

    def __init__(self, vector_store: FAISS, embeddings):
        self.embeddings = embeddings
        self.vector_store = vector_store

    def retrieve_similar_chunks(self, query: str, k: int = 5) -> List[Document]:
        """Return up to ``k`` documents most similar to ``query``.

        Any error raised by the search is printed and an empty list is
        returned instead.
        """
        try:
            matches = self.vector_store.similarity_search(query, k=k)
        except Exception as e:
            print(f"Error in retrieval: {e}")
            matches = []
        return matches

class LLMInterface:
    """Chat-completion client pointed at Groq's OpenAI-compatible endpoint."""

    def __init__(self, api_key: str, model_name: str):
        groq_client = openai.OpenAI(
            api_key=api_key,
            base_url="https://api.groq.com/openai/v1",
        )
        self.client = groq_client
        self.model_name = model_name

    def generate_answer(self, context: str, question: str) -> str:
        """Ask the model to answer ``question`` using only ``context``.

        Returns the stripped model reply. If the request or response handling
        raises, the error is returned as text prefixed with
        "Error generating response: " instead of being propagated.
        """
        # Grounding prompt, assembled line by line so that every newline and
        # trailing space in the text sent to the model is explicit.
        prompt = (
            "You are a helpful assistant that answers questions based STRICTLY on the provided context. \n"
            "\n"
            "IMPORTANT INSTRUCTIONS:\n"
            "1. Only use information from the provided context to answer the question\n"
            '2. If the answer is not in the context, say "I cannot find this information in the provided document"\n'
            "3. Do not add information from your general knowledge\n"
            "4. Be specific and cite relevant parts of the context when possible\n"
            "5. If the context is unclear or insufficient, acknowledge this limitation\n"
            "\n"
            "Context:\n"
            f"{context}\n"
            "\n"
            f"Question: {question}\n"
            "\n"
            "Answer based solely on the context provided:"
        )

        system_message = {
            "role": "system",
            "content": "You are a precise document assistant that only answers based on provided context.",
        }
        user_message = {"role": "user", "content": prompt}

        request = dict(
            model=self.model_name,
            messages=[system_message, user_message],
            max_tokens=512,
            temperature=0.1,  # low temperature to reduce hallucination
            top_p=0.9,
            frequency_penalty=0.0,
            presence_penalty=0.0,
        )

        try:
            completion = self.client.chat.completions.create(**request)
            reply = completion.choices[0].message.content
            return reply.strip()
        except Exception as e:
            return f"Error generating response: {str(e)}"

class RAGApplication:
    """Ties together parsing, chunking, embedding, retrieval and generation.

    Only one document is active at a time: each successful ``process_pdf``
    call replaces the retriever used by ``query_document``.
    """

    def __init__(self):
        self.config = config
        self.pdf_parser = PDFParser()
        self.chunker = TextChunker(
            chunk_size=self.config.CHUNK_SIZE,
            chunk_overlap=self.config.CHUNK_OVERLAP
        )
        self.embedder = Embedder(self.config.EMBEDDING_MODEL)
        self.llm = LLMInterface(self.config.GROQ_API_KEY, self.config.MODEL_NAME)
        # Set by process_pdf(); None until a document has been indexed.
        self.current_retriever = None
        self.current_pdf_name = None

    @staticmethod
    def _safe_filename(filename: Optional[str]) -> str:
        """Reduce a client-supplied filename to its final path component.

        The name comes from the upload request and is untrusted; directory
        parts (with either slash style) are dropped so the file cannot be
        written outside the upload directory.

        Raises:
            ValueError: If nothing usable is left after stripping directories.
        """
        name = Path((filename or "").replace("\\", "/")).name
        if name in ("", ".", ".."):
            raise ValueError("Uploaded file has no usable filename")
        return name

    async def process_pdf(self, pdf_file: UploadFile) -> dict:
        """Save an uploaded PDF, index it, and make it the active document.

        Args:
            pdf_file: The uploaded file; its content is read in full.

        Returns:
            On success, a dict with ``status`` "success", a ``message``, and
            the ``chunks_created`` / ``text_length`` counts. On any failure, a
            dict with ``status`` "error" and a ``message`` describing it;
            exceptions are not propagated.
        """
        try:
            # Save uploaded file under a sanitised name
            safe_name = self._safe_filename(pdf_file.filename)
            pdf_path = os.path.join(self.config.UPLOAD_PATH, safe_name)
            content = await pdf_file.read()
            with open(pdf_path, "wb") as f:
                f.write(content)

            # Extract text
            text = self.pdf_parser.extract_text_from_pdf(pdf_path)

            if not text.strip():
                raise Exception("No text found in PDF")

            # Chunk text
            chunks = self.chunker.chunk_text(text)

            if not chunks:
                raise Exception("No valid chunks created from PDF")

            # Create vector store
            vector_store = self.embedder.create_vector_store(chunks, safe_name)

            # Save vector store
            vector_store_path = os.path.join(self.config.VECTOR_STORE_PATH, "faiss_index")
            self.embedder.save_vector_store(vector_store_path)

            # Create retriever; from here on queries target this document
            self.current_retriever = Retriever(vector_store, self.embedder.embeddings)
            self.current_pdf_name = safe_name

            return {
                "status": "success",
                "message": f"PDF '{safe_name}' processed successfully",
                "chunks_created": len(chunks),
                "text_length": len(text)
            }

        except Exception as e:
            return {
                "status": "error",
                "message": f"Error processing PDF: {str(e)}"
            }

    def query_document(self, question: str) -> dict:
        """Answer ``question`` from the active document.

        Returns:
            On success, a dict with ``status`` "success", the ``answer`` text,
            the number of retrieved chunks as ``sources``, and the active
            ``document`` name. Otherwise a dict with ``status`` "error" and a
            ``message``; exceptions are not propagated.
        """
        try:
            if not self.current_retriever:
                return {
                    "status": "error",
                    "message": "No document has been processed yet. Please upload a PDF first."
                }

            # Retrieve similar chunks
            retrieved_docs = self.current_retriever.retrieve_similar_chunks(
                question,
                k=self.config.MAX_RETRIEVED_CHUNKS
            )

            if not retrieved_docs:
                return {
                    "status": "error",
                    "message": "No relevant information found in the document."
                }

            # Prepare context
            context = "\n\n".join([doc.page_content for doc in retrieved_docs])

            # Generate answer
            answer = self.llm.generate_answer(context, question)

            return {
                "status": "success",
                "answer": answer,
                "sources": len(retrieved_docs),
                "document": self.current_pdf_name
            }

        except Exception as e:
            return {
                "status": "error",
                "message": f"Error querying document: {str(e)}"
            }

app = FastAPI(title="Advanced RAG PDF Query System", version="1.0.0")

# Wildcard origins must not be combined with credentialed requests: that would
# let any website make cookie-bearing calls to this API. The bundled page is
# served from the same origin and sends no credentials, so they are disabled.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize RAG application
rag_app = RAGApplication()

HTML_TEMPLATE = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Advanced RAG PDF Query System</title>
    <style>
        * {
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }
        
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            min-height: 100vh;
            padding: 20px;
        }
        
        .container {
            max-width: 1200px;
            margin: 0 auto;
            background: white;
            border-radius: 15px;
            box-shadow: 0 20px 40px rgba(0,0,0,0.1);
            overflow: hidden;
        }
        
        .header {
            background: linear-gradient(135deg, #2c3e50, #3498db);
            color: white;
            padding: 30px;
            text-align: center;
        }
        
        .header h1 {
            font-size: 2.5em;
            margin-bottom: 10px;
        }
        
        .header p {
            font-size: 1.1em;
            opacity: 0.9;
        }
        
        .main-content {
            padding: 40px;
        }
        
        .upload-section, .query-section {
            background: #f8f9fa;
            border-radius: 10px;
            padding: 30px;
            margin-bottom: 30px;
            border: 2px dashed #dee2e6;
            transition: all 0.3s ease;
        }
        
        .upload-section:hover, .query-section:hover {
            border-color: #3498db;
            transform: translateY(-2px);
        }
        
        .section-title {
            font-size: 1.5em;
            color: #2c3e50;
            margin-bottom: 20px;
            display: flex;
            align-items: center;
        }
        
        .section-title::before {
            content: "";
            margin-right: 10px;
        }
        
        .query-section .section-title::before {
            content: "";
        }
        
        .form-group {
            margin-bottom: 20px;
        }
        
        .form-group label {
            display: block;
            margin-bottom: 8px;
            font-weight: 600;
            color: #2c3e50;
        }
        
        .form-control {
            width: 100%;
            padding: 12px 16px;
            border: 2px solid #dee2e6;
            border-radius: 8px;
            font-size: 16px;
            transition: border-color 0.3s ease;
        }
        
        .form-control:focus {
            outline: none;
            border-color: #3498db;
        }
        
        .btn {
            background: linear-gradient(135deg, #3498db, #2980b9);
            color: white;
            padding: 12px 30px;
            border: none;
            border-radius: 8px;
            cursor: pointer;
            font-size: 16px;
            font-weight: 600;
            transition: all 0.3s ease;
        }
        
        .btn:hover {
            transform: translateY(-2px);
            box-shadow: 0 5px 15px rgba(52, 152, 219, 0.4);
        }
        
        .btn:disabled {
            background: #bdc3c7;
            cursor: not-allowed;
            transform: none;
        }
        
        .status-message {
            padding: 15px;
            border-radius: 8px;
            margin: 20px 0;
            font-weight: 500;
        }
        
        .success {
            background: #d4edda;
            color: #155724;
            border: 1px solid #c3e6cb;
        }
        
        .error {
            background: #f8d7da;
            color: #721c24;
            border: 1px solid #f5c6cb;
        }
        
        .answer-box {
            background: white;
            border: 2px solid #e9ecef;
            border-radius: 10px;
            padding: 25px;
            margin-top: 20px;
            font-size: 16px;
            line-height: 1.6;
            box-shadow: 0 2px 10px rgba(0,0,0,0.05);
        }
        
        .loading {
            display: none;
            text-align: center;
            padding: 20px;
        }
        
        .loading::after {
            content: "";
            display: inline-block;
            width: 20px;
            height: 20px;
            border: 3px solid #f3f3f3;
            border-top: 3px solid #3498db;
            border-radius: 50%;
            animation: spin 1s linear infinite;
        }
        
        @keyframes spin {
            0% { transform: rotate(0deg); }
            100% { transform: rotate(360deg); }
        }
        
        .file-info {
            background: #e3f2fd;
            border: 1px solid #bbdefb;
            border-radius: 8px;
            padding: 15px;
            margin-top: 15px;
            display: none;
        }
    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <h1>Advanced RAG PDF Query System</h1>
            <p>Upload your PDF document and ask intelligent questions powered by Groq AI</p>
        </div>
        
        <div class="main-content">
            <!-- Upload Section -->
            <div class="upload-section">
                <h2 class="section-title">Upload PDF Document</h2>
                <form id="uploadForm" enctype="multipart/form-data">
                    <div class="form-group">
                        <label for="pdfFile">Select PDF File:</label>
                        <input type="file" id="pdfFile" name="file" accept=".pdf" class="form-control" required>
                    </div>
                    <button type="submit" id="uploadBtn" class="btn">Upload & Process PDF</button>
                </form>
                <div id="uploadStatus"></div>
                <div id="fileInfo" class="file-info"></div>
            </div>
            
            <!-- Query Section -->
            <div class="query-section">
                <h2 class="section-title">Ask Questions</h2>
                <form id="queryForm">
                    <div class="form-group">
                        <label for="question">Your Question:</label>
                        <input type="text" id="question" name="question" class="form-control" 
                               placeholder="Ask anything about your uploaded document..." required>
                    </div>
                    <button type="submit" id="queryBtn" class="btn" disabled>Ask Question</button>
                </form>
                <div id="queryStatus"></div>
                <div class="loading" id="loading">Processing your question...</div>
                <div id="answerBox" class="answer-box" style="display: none;"></div>
            </div>
        </div>
    </div>

    <script>
        let documentUploaded = false;
        
        // Upload Form Handler
        document.getElementById('uploadForm').addEventListener('submit', async function(e) {
            e.preventDefault();

            const fileInput = document.getElementById('pdfFile');
            const file = fileInput.files[0];

            if (!file) {
                showMessage('uploadStatus', 'Please select a PDF file', 'error');
                return;
            }

            const formData = new FormData();
            formData.append('file', file);

            const uploadBtn = document.getElementById('uploadBtn');
            uploadBtn.disabled = true;
            uploadBtn.textContent = 'Processing...';

            try {
                const response = await fetch('/upload', {
                    method: 'POST',
                    body: formData
                });

                const result = await response.json();

                if (response.ok && result.status === 'success') {
                    showMessage('uploadStatus', result.message, 'success');
                    documentUploaded = true;
                    document.getElementById('queryBtn').disabled = false;

                    // Show file info; counts are coerced to numbers before
                    // being placed into markup.
                    const fileInfo = document.getElementById('fileInfo');
                    fileInfo.innerHTML = `
                        <strong>File processed successfully!</strong><br>
                        Chunks created: ${Number(result.chunks_created)}<br>
                        Text length: ${Number(result.text_length)} characters
                    `;
                    fileInfo.style.display = 'block';
                } else {
                    // Application errors carry `message`; HTTP errors raised
                    // by the server (e.g. 400 for a non-PDF) carry `detail`.
                    const reason = result.message || result.detail ||
                        ('Upload failed (HTTP ' + response.status + ')');
                    showMessage(
                        'uploadStatus',
                        typeof reason === 'string' ? reason : JSON.stringify(reason),
                        'error'
                    );
                }
            } catch (error) {
                showMessage('uploadStatus', 'Error uploading file: ' + error.message, 'error');
            } finally {
                // Always restore the button, whatever happened above.
                uploadBtn.disabled = false;
                uploadBtn.textContent = 'Upload & Process PDF';
            }
        });
        
        // Query Form Handler
        document.getElementById('queryForm').addEventListener('submit', async function(e) {
            e.preventDefault();

            if (!documentUploaded) {
                showMessage('queryStatus', 'Please upload a PDF first', 'error');
                return;
            }

            const question = document.getElementById('question').value;
            const queryBtn = document.getElementById('queryBtn');
            const loading = document.getElementById('loading');
            const answerBox = document.getElementById('answerBox');

            queryBtn.disabled = true;
            loading.style.display = 'block';
            answerBox.style.display = 'none';

            try {
                const response = await fetch('/query', {
                    method: 'POST',
                    headers: {
                        'Content-Type': 'application/x-www-form-urlencoded',
                    },
                    body: `question=${encodeURIComponent(question)}`
                });

                const result = await response.json();

                if (response.ok && result.status === 'success') {
                    // The answer comes from the model and the document name
                    // from the uploaded file, so both are inserted as text
                    // nodes and never parsed as HTML.
                    const heading = document.createElement('h3');
                    heading.textContent = 'Answer:';

                    const answerText = document.createElement('p');
                    answerText.textContent = result.answer;

                    const divider = document.createElement('hr');
                    divider.style.margin = '15px 0';

                    const sources = document.createElement('small');
                    const sourcesLabel = document.createElement('strong');
                    sourcesLabel.textContent = 'Sources:';
                    sources.appendChild(sourcesLabel);
                    sources.appendChild(document.createTextNode(
                        ' ' + result.sources + ' relevant chunks from ' + result.document
                    ));

                    answerBox.innerHTML = '';
                    answerBox.appendChild(heading);
                    answerBox.appendChild(answerText);
                    answerBox.appendChild(divider);
                    answerBox.appendChild(sources);
                    answerBox.style.display = 'block';
                    showMessage('queryStatus', '', '');
                } else {
                    // Application errors carry `message`; HTTP errors raised
                    // by the server (e.g. 422 validation) carry `detail`.
                    const reason = result.message || result.detail ||
                        ('Query failed (HTTP ' + response.status + ')');
                    showMessage(
                        'queryStatus',
                        typeof reason === 'string' ? reason : JSON.stringify(reason),
                        'error'
                    );
                }
            } catch (error) {
                showMessage('queryStatus', 'Error processing query: ' + error.message, 'error');
            } finally {
                // Always hide the spinner and re-enable the button.
                loading.style.display = 'none';
                queryBtn.disabled = false;
            }
        });
        
        // Show `message` inside the element with id `elementId`, styled with
        // the CSS class `type`; an empty message just clears the element.
        // The text is set through textContent so server- or model-supplied
        // strings are never parsed as HTML.
        function showMessage(elementId, message, type) {
            const element = document.getElementById(elementId);
            element.innerHTML = '';
            if (!message) {
                return;
            }
            const box = document.createElement('div');
            box.className = 'status-message ' + type;
            box.textContent = message;
            element.appendChild(box);
        }
    </script>
</body>
</html>
"""

@app.get("/", response_class=HTMLResponse)
async def home():
    """Return the single-page upload/query UI as HTML."""
    page = HTML_TEMPLATE
    return page

@app.post("/upload")
async def upload_pdf(file: UploadFile = File(...)):
    """Upload and process a PDF file.

    Returns the result dict of ``rag_app.process_pdf`` as JSON.

    Raises:
        HTTPException: 400 when the upload has no filename or the filename
            does not end in ``.pdf`` (case-insensitive).
    """
    # The filename is client-supplied and may be missing entirely; treat that
    # as "not a PDF" instead of failing with an AttributeError (HTTP 500).
    filename = file.filename or ""
    if not filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files are allowed")

    result = await rag_app.process_pdf(file)
    return JSONResponse(content=result)

@app.post("/query")
async def query_document(question: str = Form(...)):
    """Query the processed document.

    Returns the result dict of ``rag_app.query_document`` as JSON, or an
    error dict when the question is blank.
    """
    if not question.strip():
        return JSONResponse(
            content={"status": "error", "message": "Question must not be empty."}
        )

    # Retrieval and the LLM request are synchronous calls; run them in a
    # worker thread so they do not block the event loop for other requests.
    result = await asyncio.to_thread(rag_app.query_document, question)
    return JSONResponse(content=result)

@app.get("/health")
async def health_check():
    """Liveness probe: always reports the service as healthy."""
    payload = dict(status="healthy", message="RAG system is running")
    return payload


# Enable nested event loops for Jupyter
nest_asyncio.apply()

def run_server(host: str = "0.0.0.0", port: int = 8000):
    """Run the FastAPI app under uvicorn; blocks until the server exits.

    Intended to be used as a thread target so the notebook stays responsive.

    Args:
        host: Interface to bind. The default "0.0.0.0" listens on all
            interfaces; pass "127.0.0.1" to accept local connections only.
        port: TCP port to listen on.
    """
    uvicorn.run(
        app,
        host=host,
        port=port,
        reload=False,
        log_level="info"
    )

# Start the server
print("Starting Advanced RAG PDF Query System...")
# Report the model that is actually configured rather than a hard-coded name.
print(f"Using Groq API with model: {config.MODEL_NAME}")
print("Server will be available at: http://localhost:8000")
print("Make sure to set your GROQ_API_KEY in the .env file")
print("Starting server in background...")

# Run server in a separate thread
server_thread = threading.Thread(target=run_server, daemon=True)
server_thread.start()

# Give the server time to start
time.sleep(3)

# run_server() only returns when the server has exited, so a dead thread at
# this point means startup failed (for example, the port is already taken
# because this cell was run twice).
if server_thread.is_alive():
    print("Server started successfully!")
    print("Open your browser and go to: http://localhost:8000")
    print("Upload a PDF and start asking questions!")
    print("To stop the server, restart the kernel or interrupt the notebook")
else:
    print("Server failed to start. Check the log output above; port 8000 may already be in use.")