In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFDirectoryLoader, TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Qdrant
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from qdrant_client import QdrantClient,models

from datasets import Dataset
from ragas.metrics import context_precision, answer_relevancy
from ragas import evaluate
from pathlib import Path
from dotenv import load_dotenv
import os
load_dotenv()

# Configuration
DATA_DIR = Path("./data")  # Path to your PDF data directory
QDRANT_HOST =  "http://localhost:6333"  # Local Qdrant host
QDRANT_COLLECTION = "law_docs"  # Qdrant collection name
OPENAI_API_KEY = "lm-studio"  # Your OpenAI API key
LLM_MODEL = os.getenv("LLM_MODEL", "text-davinci-003")  # OpenAI LLM model

# --- Data Loading and Preprocessing ---

def load_directory_pdf(directory_path):
    """Loads PDF files from a directory."""
    loader = PyPDFDirectoryLoader(directory_path)
    docs = loader.load()
    return docs

def text_splitter(document):
    """Splits documents into chunks."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )
    texts = text_splitter.split_documents(document)
    return texts

# --- Qdrant Database Setup ---

def create_qdrant_collection(collection_name):
    """Creates a Qdrant collection if it doesn't exist."""
    qclient = QdrantClient(url=QDRANT_HOST)
    if qclient.collection_exists(collection_name=collection_name):
        print(f"Vector DB already exists: {collection_name}")
        return
    print(f"Vector DB Successfully created: {collection_name}")
    qclient.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
    )

# --- Embedding and Storing Documents in Qdrant ---

def embed_and_store(chunks):
    """Embeds text chunks and stores them in Qdrant."""
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    qdrant_client = QdrantClient(url=QDRANT_HOST)
    vectorstore = Qdrant(client=qdrant_client, collection_name=QDRANT_COLLECTION, embeddings=embeddings)
    vectorstore.add_documents(chunks)
    print("Embedding created successfully")

# --- Retrieval and LLM Interaction ---

def get_llm_response(query):
    """Retrieves relevant context and generates a response using LLM."""
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    qdrant_client = QdrantClient(url=QDRANT_HOST)
    vectorstore = Qdrant(client=qdrant_client, collection_name=QDRANT_COLLECTION, embeddings=embeddings)
    retriever = vectorstore.as_retriever()
    
    llm = OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0.7)
    prompt_template = PromptTemplate(
        template="You are a helpful and informative lawyer. Answer the following question based on the provided context: {question}\n\nContext: {context}",
        input_variables=["question", "context"],
    )
    qa = RetrievalQA.from_chain_type(
        llm=llm, chain_type="stuff", retriever=retriever, prompt=prompt_template
    )
    
    return qa.run(query)

# --- RAGAS Evaluation ---

def evaluate_rag(query, response, context, ground_truth):
    """Evaluates the RAG system using RAGAS metrics."""
    data_samples = {
        'question': [query],
        'answer': [response],
        'contexts': [[context]],
        'ground_truth': [ground_truth]
    }
    dataset = Dataset.from_dict(data_samples)
    score = evaluate(dataset, metrics=[context_precision, answer_relevancy])
    return score.to_pandas()

# --- Main Function ---

def main():
    # # Create Qdrant collection
    # create_qdrant_collection(QDRANT_COLLECTION)

    # # Load PDFs and split them
    # data = load_directory_pdf(DATA_DIR)
    # document = text_splitter(data)

    # # Embed and store chunks in Qdrant
    # embed_and_store(document)

    while True:
        # Get user query
        query = input("Enter your question: ")

        # Get response using RAG
        response = get_llm_response(query)
        print(f"LLM Response: {response}\n")

        # Evaluate RAG
        ground_truth = input("Enter ground truth for evaluation: ")  # Get ground truth from user
        evaluation_results = evaluate_rag(query, response, response['context'], ground_truth)
        print(f"Evaluation results: \n{evaluation_results}\n")

        # Ask if user wants to continue
        continue_query = input("Do you want to ask another question (y/n)? ")
        if continue_query.lower() != 'y':
            break

if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm


Vector DB Successfully created: law_docs


  warn_deprecated(
  warn_deprecated(


Embedding created successfully


  warn_deprecated(


ValidationError: 1 validation error for RetrievalQA
prompt
  extra fields not permitted (type=value_error.extra)