In [None]:
import os  # For interacting with the operating system (e.g., file paths)
import re  # For regular expression operations
import pdfplumber  # For extracting text from PDF files
from dotenv import load_dotenv  # For loading environment variables from a `.env` file

In [None]:
# LangChain Core Libraries
from langchain.schema import Document  # Document schema for managing structured text data
from langchain.embeddings import OpenAIEmbeddings  # Embeddings using OpenAI models
from langchain.text_splitter import RecursiveCharacterTextSplitter  # Text splitting into chunks

# LangChain Community Libraries
from langchain_community.vectorstores import FAISS  # Vector store for semantic search using FAISS
from langchain_core.documents import Document  # Another Document schema (to avoid duplication, remove one)
from langchain_core.output_parsers import StrOutputParser  # Converts outputs to strings
from langchain_core.runnables import RunnablePassthrough  # Pass-through for inputs in chains
from langchain_core.prompts import PromptTemplate  # For creating prompt templates for models

# LangChain OpenAI-Specific Libraries
from langchain_openai import OpenAIEmbeddings, ChatOpenAI  # OpenAI Embeddings and Chat API wrapper

In [None]:
# RAGAS Core Libraries
from ragas import evaluate  # Main evaluation function for RAGAS
from ragas.llms import LangchainLLMWrapper  # Wrapper for LLMs to ensure compatibility with RAGAS
from ragas.embeddings import LangchainEmbeddingsWrapper  # Wrapper for embeddings compatibility
from ragas.testset.generator import TestsetGenerator  # Testset generator for creating question-answer pairs
from ragas.testset.evolutions import simple, reasoning, multi_context, conditional  # Question type strategies
from ragas.testset.extractor import KeyphraseExtractor  # Extracts key phrases from documents
from ragas.testset.docstore import InMemoryDocumentStore  # Stores documents in memory for fast access
from ragas.metrics import answer_relevancy, faithfulness, context_recall, context_precision  # Evaluation metrics

from datasets import Dataset  # Hugging Face library for dataset manipulation and handling

In [None]:
# Load environment variables from a local `.env` file, if one is present.
# NOTE(review): presumably this supplies the OpenAI API key used by the cells below — confirm.
load_dotenv()

# 🔹 Global Variable Declaration
# Module-level accumulator that add_pdf_to_documents() appends page-level
# Document objects to. Re-running this cell resets it to an empty list.
documents = []  # Used as a global list

In [None]:
# 🔹 Functions Used

# 🔹 Function to List Files in a Folder
def get_filenames_in_folder(folder_path):
    """Return the sorted names of the regular files directly inside a folder.

    Sub-directories are skipped and the listing is not recursive.

    Args:
        folder_path: Directory to list.

    Returns:
        list[str]: Bare file names (no directory prefix) in sorted order.
        If the folder cannot be listed (missing, not a directory, no
        permission, invalid path value), the error is printed and an empty
        list is returned.
    """
    try:
        # os.listdir() returns entries in arbitrary, platform-dependent order;
        # sorting keeps the downstream document order stable between runs.
        return sorted(
            name
            for name in os.listdir(folder_path)
            if os.path.isfile(os.path.join(folder_path, name))
        )
    except (OSError, TypeError, ValueError) as e:
        # Best-effort by design: report the problem and let the caller
        # continue with an empty listing.
        print(f"Error occurred: {e}")
        return []

# 🔹 Function to Split PDF Files by Page
def chunk_pdf_with_pdfplumber(file_path, start_page=1, end_page=-1):
    """Extract cleaned text from a page range of a PDF, one chunk per page.

    Args:
        file_path: Path of the PDF, opened with ``pdfplumber.open``.
        start_page: 1-based first page to read. Values below 1 are treated
            as 1.
        end_page: 1-based last page to read (inclusive). Negative values count
            from the end (-1 is the last page, -2 the second to last). Values
            beyond the last page are clamped to the last page.

    Returns:
        list[dict]: One dict per page that yielded text, each with
        ``"page_content"`` (cleaned, stripped text) and ``"metadata"``
        (``source_type``, ``file_name``, 1-based ``page_number``). Pages with
        no extractable text are skipped; an empty page range returns ``[]``.
    """
    # Compile the clean-up patterns once instead of once per page.
    escape_whitespace = re.compile(r'\n|\r|\t')
    table_label = re.compile(r'표<\d+-\d+>')
    # NOTE(review): the middle alternative is a single space as written, which
    # removes every space on the page (including those substituted for line
    # breaks just before) and prevents the figure/table pattern below, which
    # contains a space, from matching. Confirm this is intended.
    bullet_marks = re.compile(r'□| |○')
    figure_or_table_ref = re.compile(r'<(그림|표) \d+-\d+>')

    chunks = []
    with pdfplumber.open(file_path) as pdf:
        total_pages = len(pdf.pages)  # Total number of pages in the PDF

        # Translate a negative end_page into an absolute 1-based page number.
        if end_page < 0:
            end_page = total_pages + end_page + 1

        # Clamp to the pages that exist: a start below 1 would otherwise
        # become a negative index (silently reading from the end), and an end
        # past the last page would raise IndexError.
        first_index = max(start_page - 1, 0)
        stop_index = min(end_page, total_pages)

        for page_num in range(first_index, stop_index):
            text = pdf.pages[page_num].extract_text()
            if not text:
                continue  # Page produced no text; skip it

            cleaned_text = escape_whitespace.sub(' ', text)  # Line breaks/tabs -> space
            cleaned_text = table_label.sub('', cleaned_text)  # Drop '표<n-n>' labels
            cleaned_text = bullet_marks.sub('', cleaned_text)  # Drop '□', ' ', '○'
            cleaned_text = figure_or_table_ref.sub('', cleaned_text)  # Drop '<그림 n-n>' / '<표 n-n>'

            chunks.append({
                "page_content": cleaned_text.strip(),
                "metadata": {
                    "source_type": "pdf",
                    "file_name": file_path,
                    "page_number": page_num + 1
                }
            })

    return chunks

# 🔹 Function to Add a New PDF File and Accumulate into Documents (Allow Duplicates)
def add_pdf_to_documents(file_path, start_page=1, end_page=-1):
    """Read a PDF page range and append its pages to the global ``documents`` list.

    Each chunk returned by ``chunk_pdf_with_pdfplumber`` is wrapped in a
    ``Document``. No de-duplication is done: adding the same file again
    appends the same pages again.

    Args:
        file_path: Path of the PDF to read.
        start_page: First page to read, forwarded to ``chunk_pdf_with_pdfplumber``.
        end_page: Last page to read, forwarded to ``chunk_pdf_with_pdfplumber``.

    Returns:
        list[Document]: Only the Documents created by this call.
    """
    chunk_dicts = chunk_pdf_with_pdfplumber(file_path, start_page, end_page)

    # 🔹 Convert each page chunk of the PDF into a Document (duplicates allowed)
    new_documents = [
        Document(page_content=chunk["page_content"], metadata=chunk["metadata"])
        for chunk in chunk_dicts
    ]

    # The module-level list is mutated in place, never rebound, so no
    # `global` declaration is required.
    documents.extend(new_documents)
    print(f"✅ {len(new_documents)} chunks from the PDF '{file_path}' have been added to documents.")
    print(f"📂 Total number of Documents: {len(documents)}")

    return new_documents

In [None]:
# 🔹 Add PDF Files (Duplicates Allowed) and Build the Vector Store
PDF_DIR = './pdfs'        # Folder scanned for input PDFs
FAISS_DIR = './db/faiss'  # Where the FAISS index is saved

# Only hand real PDFs to pdfplumber; stray files in the folder
# (e.g. .DS_Store, notes) would make it fail.
file_list = [
    file for file in get_filenames_in_folder(PDF_DIR)
    if file.lower().endswith('.pdf')
]

for file in file_list:
    file_path = f"{PDF_DIR}/{file}"  # Path to the PDF file

    # Add each PDF once, reading from page 3 through the second-to-last page.
    # Re-running this cell appends the same pages again (duplicates allowed).
    add_pdf_to_documents(file_path, start_page=3, end_page=-2)

# Fail here with a clear message rather than later with a NameError on
# `vectorstore` or an opaque error from the embedding call.
if not documents:
    raise ValueError(f"No PDF pages were loaded from '{PDF_DIR}'; cannot build the vector store.")

# Create the vector database once, after every file has been loaded.
# Building it inside the loop re-embedded all accumulated documents per file.
vectorstore = FAISS.from_documents(documents=documents, embedding=OpenAIEmbeddings())

# Save the database locally
vectorstore.save_local(FAISS_DIR)

In [None]:
# 🔹 RAG-based Test Set Generation Pipeline
# Builds a RAGAS TestsetGenerator from LangChain models and generates a
# synthetic question set from the global `documents` list.
# NOTE(review): the `ragas.testset.generator` / `ragas.testset.evolutions`
# import paths look like the ragas 0.1.x API — confirm the pinned version.

# Generator LLM (passed to the test set generator as its first model)
generator_llm = ChatOpenAI(model='gpt-4o')

# Critic LLM (passed to the test set generator as its second model)
critic_llm = ChatOpenAI(model='gpt-4o')

# Embedding Model
embeddings = OpenAIEmbeddings()

# Wrapper for Embedding Model to Ensure Compatibility with RAGAS
ragas_embeddings = LangchainEmbeddingsWrapper(embeddings)

# Model for Creating Keyphrase Extractor (Wrapper for RAGAS Compatibility)
langchain_llm = LangchainLLMWrapper(ChatOpenAI(model='gpt-4o'))

# Keyphrase Extractor: Identifies and Extracts Key Information from Documents
Keyphrase_extractor = KeyphraseExtractor(llm=langchain_llm)

# Text splitter handed to the document store: 1000-character chunks with a
# 400-character overlap between consecutive chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=400)

# In-Memory Document Store, wired to the splitter, embeddings and extractor above
docstore = InMemoryDocumentStore(
    splitter=splitter,
    embeddings=ragas_embeddings,
    extractor=Keyphrase_extractor
)

# Test set generator built from four components:
# generator LLM, critic LLM, embedding model and document store.
generator = TestsetGenerator.from_langchain(
    generator_llm,  # Generator
    critic_llm,     # Critic
    ragas_embeddings,  # Embedding Model
    docstore=docstore  # Document Store
)

# Distribution of Question Types (the weights sum to 1.0)
distributions = {
    simple: 0.4,  # Questions with a single clear answer
    reasoning: 0.2,  # Questions requiring reasoning based on multiple clues
    multi_context: 0.2,  # Questions requiring understanding multiple contexts
    conditional: 0.2  # Conditional questions, requiring specific conditions
}

# 🔹 Generate Test Set
testset = generator.generate_with_langchain_docs(
    documents=documents,  # Pass the accumulated document list
    test_size=20,  # Number of question-answer sets to generate
    distributions=distributions,  # Question-type distribution defined above
    with_debugging_logs=True,  # Enable debugging logs
    raise_exceptions=False  # Do not halt on exceptions; NOTE(review): may yield fewer than test_size rows — verify
)

In [None]:
# 🔹 RAG-Based Question Answering Evaluation Pipeline
# Answers every generated question with the retriever + LLM, then scores the
# answers and the retrieved contexts with RAGAS.

test_df = testset.to_pandas()
test_dataset = Dataset.from_pandas(test_df)

# Retriever
retriever = vectorstore.as_retriever()

# Prompt Template for Answering Questions
prompt = PromptTemplate.from_template(
    """You are an AI designed to answer questions using the given context. 
    Answer in the appropriate language for the context.
    If you don't know the answer, respond with 'I don't know.'
    
    # Context: {context}
    # Question: {question}
    # Answer:
    """
)

# LLM for Question Answering
llm = ChatOpenAI(model='gpt-4o', temperature=0)


def format_docs(docs):
    """Join the page_content of retrieved Documents into one context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# Prompt -> LLM -> plain string, for inputs whose context is already retrieved
answer_chain = prompt | llm | StrOutputParser()

# Chain to Handle Context Retrieval, Question Answering, and Output Parsing.
# Same interface as before (question string in, answer string out), but the
# prompt now receives the documents' text instead of the repr of a Document list.
chain = (
    {'context': retriever | format_docs, 'question': RunnablePassthrough()}
    | answer_chain
)

# Batch Processing of Questions
batch_dataset = list(test_dataset['question'])

# Retrieve once per question and keep the result, so the contexts that are
# scored are exactly the contexts the LLM answered from.
retrieved_docs = retriever.batch(batch_dataset)
retrieved_contexts = [[doc.page_content for doc in docs] for docs in retrieved_docs]

answer = answer_chain.batch([
    {'context': format_docs(docs), 'question': question}
    for question, docs in zip(batch_dataset, retrieved_docs)
])

# Add or Update the 'contexts' and 'answer' Columns in the Test Dataset.
# The test set's own 'contexts' column holds the passages each question was
# generated from; RAGAS expects the contexts passed to the LLM, so it is
# replaced here. The generation-time contexts remain available in test_df.
for column_name, values in (('contexts', retrieved_contexts), ('answer', answer)):
    if column_name in test_dataset.column_names:
        test_dataset = test_dataset.remove_columns([column_name])
    test_dataset = test_dataset.add_column(column_name, values)

# Evaluate Model Performance
result = evaluate(
    dataset=test_dataset,
    metrics=[
        context_recall,    # How well the retrieved context covers the ground truth
        faithfulness,      # Whether the answer is supported by the retrieved context
        answer_relevancy,  # Relevance of the answer to the question
        context_precision  # How much of the retrieved context is actually relevant
    ]
)

# Convert Evaluation Results to DataFrame
result_df = result.to_pandas()

In [None]:
# Display the evaluation results table (the DataFrame built from `result`)
result_df