In [1]:
from dotenv import load_dotenv, find_dotenv
from langchain_core.documents import Document
from langchain_community.document_loaders import TextLoader

# Load variables from whichever .env file find_dotenv() locates and show
# load_dotenv's return value.
env_loaded = load_dotenv(find_dotenv())
print(env_loaded)

# Read the Bella Vista FAQ text file into a list of LangChain documents.
loader = TextLoader("./bella_vista.txt")
docs = loader.load()

# Show the loaded documents followed by how many there are.
print(docs, len(docs), sep="\n")


True
[Document(metadata={'source': './bella_vista.txt'}, page_content="Q: What are the hours of operation for Bella Vista?\nA: Bella Vista is open from 11 a.m. to 11 p.m. from Monday to Saturday. On Sundays, we welcome guests from 12 p.m. to 10 p.m.\n\nQ: What type of cuisine does Bella Vista serve?\nA: Bella Vista offers a delightful blend of Mediterranean and contemporary American cuisine. We pride ourselves on using the freshest ingredients, many of which are sourced locally.\n\nQ: Do you offer vegetarian or vegan options at Bella Vista?\nA: Absolutely! Bella Vista boasts a diverse menu that includes a variety of vegetarian and vegan dishes. Our chefs are also happy to customize dishes based on dietary needs.\n\nQ: Is Bella Vista family-friendly?\nA: Yes, Bella Vista is a family-friendly establishment. We have a dedicated kids' menu and offer high chairs and booster seats for our younger guests.\n\nQ: Can I book private events at Bella Vista?\nA: Certainly! Bella Vista has a private

In [5]:
from pathlib import Path
import time
import concurrent.futures
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.schema import Document
import concurrent.futures
import time
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Helper functions defined below:
# - extract_text_from_pdf: Extracts text from a PDF file.
# - split_text_into_chunks: Splits text into smaller chunks.
# - parse_output: Parses the LLM's output to extract the summary, key findings, and tags.

def extract_text_from_pdf(pdf_path: Path) -> str:
    """Return the text of every page of the PDF at ``pdf_path``, joined in page order.

    Args:
        pdf_path: Location of the PDF file; it is opened in binary mode.

    Returns:
        The concatenated page texts. A page whose ``extract_text()`` result is
        falsy (e.g. ``None``) contributes an empty string.
    """
    with pdf_path.open("rb") as pdf_file:
        # Pages are read while the file handle is still open.
        pages = PyPDF2.PdfReader(pdf_file).pages
        return "".join(page.extract_text() or "" for page in pages)

def split_text_into_chunks(text, chunk_size=1500, chunk_overlap=200):
    """Split ``text`` with a ``RecursiveCharacterTextSplitter`` and return the chunks.

    Args:
        text: The string to split.
        chunk_size: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        The value returned by the splitter's ``split_text(text)``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

# Section names recognised by parse_output, in the order they are checked.
_SECTION_NAMES = ("summary", "key findings", "tags")

# Characters that may decorate a header line before the section name
# (Markdown headings, emphasis, bullets, quote markers, list numbering).
_HEADER_DECORATION = "#*_->0123456789.) \t"


def _inline_header_content(line):
    """Return the text that follows the first colon on a header line ('' if none).

    A leading token made only of emphasis markers is dropped, so a header such
    as ``**Summary:** text`` yields ``text`` rather than ``** text``.
    """
    if ":" not in line:
        return ""
    remainder = line.split(":", 1)[1].strip()
    first_token, _, rest = remainder.partition(" ")
    if first_token and not first_token.strip("*_"):
        remainder = rest.strip()
    return remainder


def parse_output(output):
    """Split an LLM response into summary, key findings and tags.

    A line is treated as a section header only when, after removing leading
    Markdown decoration (``#``, ``*``, ``-``, list numbers, ...), it *starts
    with* "summary", "key findings" or "tags" (case-insensitive). Matching the
    section name anywhere in the line would misclassify ordinary content such
    as "2. A short summary step: ..." as a new section and lose part of it.
    This is still a heuristic: a content line that itself begins with one of
    the section names is taken as a header.

    Text after the first colon on a header line is kept as the first piece of
    that section. Every following line (including blank ones) is appended to
    the current section until the next header.

    Args:
        output: Raw text returned by the LLM.

    Returns:
        A tuple ``(summary, key_findings, tags)`` where the first two are
        strings and ``tags`` is a list of non-empty, comma-separated tags.

    Raises:
        ValueError: If the summary or the key findings end up empty.
    """
    # Print the raw output for debugging
    print("Raw Output:\n", output)

    collected = {name: [] for name in _SECTION_NAMES}
    current_section = None

    for line in output.split("\n"):
        stripped = line.strip()
        heading = stripped.lower().lstrip(_HEADER_DECORATION)
        matched = next(
            (name for name in _SECTION_NAMES if heading.startswith(name)), None
        )

        if matched is not None:
            current_section = matched
            inline = _inline_header_content(line)
            if inline:
                collected[matched].append(inline)
            continue

        # Lines seen before the first header belong to no section and are dropped.
        if current_section is not None:
            collected[current_section].append(stripped)

    summary = " ".join(collected["summary"]).strip()
    key_findings = " ".join(collected["key findings"]).strip()
    tags = [tag.strip() for tag in " ".join(collected["tags"]).split(",") if tag.strip()]

    # Return an error if either summary or key findings are empty
    if not summary or not key_findings:
        raise ValueError("Error: Empty summary or key findings for a chunk.")

    return summary, key_findings, tags

# Step 1: Initialize LLM and Prompts
llm = ChatOpenAI(model_name="gpt-4o")

# Prompt for summarization and key findings.
# parse_output() locates the sections by their "Summary" / "Key Findings"
# header lines, so the prompt asks for those headers explicitly instead of
# relying on the model choosing them on its own.
summary_prompt_template = PromptTemplate(
    template=(
        "You are an expert analyst. Based on the following text, perform the following tasks:\n"
        "1. Summarize the content.\n"
        "2. List key findings.\n\n"
        "Format your answer as exactly two sections, each introduced by its own header line: "
        "'Summary:' followed by the summary, then 'Key Findings:' followed by the findings.\n\n"
        "Text:\n{text}"
    ),
    input_variables=["text"]
)

# Prompt for extracting tags.
# The tag response is split on commas downstream, so the prompt requests a
# single comma-separated line rather than a free-form list.
tag_prompt_template = PromptTemplate(
    template=(
        "Provide a list of relevant tags for the following text. "
        "Respond with the tags only, on a single line, separated by commas.\n{text}"
    ),
    input_variables=["text"]
)

# Step 2: Extract text from the PDF
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
pdf_path = Path("D:/articles/Rag/RagOptimization.pdf")
pdf_text = extract_text_from_pdf(pdf_path)

# Step 3: Send entire text to LLM for summarization and key findings
summary_chain = LLMChain(llm=llm, prompt=summary_prompt_template)
summary_output = summary_chain.run({"text": pdf_text})

# parse_output returns (summary, key_findings, tags); the tags are not needed here.
summary, key_findings, _ = parse_output(summary_output)

# Print the final summary and key findings, each under its own heading
print("\nFinal Summary of the Document:", summary, sep="\n")
print("\nFinal Key Findings of the Document:", key_findings, sep="\n")

# Step 4: Split text into chunks for tag extraction and vector store preparation
chunks = split_text_into_chunks(pdf_text)

# Chain that asks the LLM for tags, one chunk at a time
tag_chain = LLMChain(llm=llm, prompt=tag_prompt_template)

# Metadata shared by every chunk: source file name and a fixed date string
file_name, date = pdf_path.name, "2024-10-15"  # You can dynamically get the current date if needed

# Step 5: Process each chunk to extract tags and create documents for vector storage
# (starts empty; filled by the parallel run further down)
documents = list()

def _clean_tags(tag_output):
    """Turn a raw tag response into a list of clean tag strings.

    The response is split on commas *and* newlines so that bulleted or
    numbered lists are handled as well as a single comma-separated line.
    Leading list decoration ("- ", "* ", "1. ", "2) ") and surrounding
    emphasis/quote characters are removed. Empty pieces and pieces ending in
    a colon (introductory lines such as "Here are the tags:") are skipped.
    """
    import re  # local import: `re` is not among this cell's imports

    tags = []
    for piece in re.split(r"[,\n]", tag_output):
        cleaned = re.sub(r"^\s*(?:[-*•]+|\d+[.)])\s*", "", piece)
        cleaned = cleaned.strip().strip("*`\"'").strip()
        if cleaned and not cleaned.endswith(":"):
            tags.append(cleaned)
    return tags


def process_chunk_for_tags(chunk, file_name, date):
    """Ask the tag chain for tags describing ``chunk`` and wrap it in a Document.

    Args:
        chunk: Text of one chunk; sent to ``tag_chain`` and stored as the
            document's ``page_content``.
        file_name: Stored under the ``"file_name"`` metadata key.
        date: Stored under the ``"date"`` metadata key.

    Returns:
        A ``Document`` whose metadata holds ``file_name``, ``date`` and the
        list of parsed ``tags``.
    """
    # Get tags for the current chunk
    tag_output = tag_chain.run({"text": chunk})
    # Short pause after each request, as in the original flow.
    time.sleep(0.250)
    # Create metadata for the document
    metadata = {
        "file_name": file_name,
        "date": date,
        "tags": _clean_tags(tag_output)
    }
    # Create a new Document with the updated metadata
    return Document(page_content=chunk, metadata=metadata)

# Execute processing in parallel (results come back in the same order as `chunks`)
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    chunk_count = len(chunks)
    documents = list(
        executor.map(
            process_chunk_for_tags,
            chunks,
            [file_name] * chunk_count,
            [date] * chunk_count,
        )
    )

# Step 6: Print documents with updated metadata for verification
print("\nProcessed Documents with Updated Metadata:")
for idx, doc in enumerate(documents, start=1):
    print(
        f"Document {idx}:",
        f"Content: {doc.page_content[:100]}...",  # Show a preview of the content
        f"Metadata: {doc.metadata}",
        "\n",
        sep="\n",
    )

# Now, `documents` can be stored in the vector store
embedding_function = OpenAIEmbeddings(model="text-embedding-ada-002")

# Parallel lists of chunk text and chunk metadata.
# (Neither list is passed to the FAISS calls below.)
texts, metadatas = [], []
for doc in documents:
    texts.append(doc.page_content)  # Extract text content
    metadatas.append(doc.metadata)  # Extract metadata

# Embed the documents, build the FAISS index and persist it under "index"
vectorstore = FAISS.from_documents(documents, embedding_function)
vectorstore.save_local("index")

# Later on, you can reload the vector store without needing to re-embed the documents
# NOTE(review): allow_dangerous_deserialization=True opts in to unsafe
# deserialization — only load index directories written by this notebook.
vectorstore = FAISS.load_local(
    "index",
    embedding_function,
    allow_dangerous_deserialization=True,
)


# Embed the query once and fetch the five nearest chunks from the index
query = "What are best practices for retrieval-augmented generation?"
query_embedding = embedding_function.embed_query(query)
retrieved_docs = vectorstore.similarity_search_by_vector(query_embedding, k=5)

# Show each hit's text (metadata printing is intentionally left out)
for idx, doc in enumerate(retrieved_docs, 1):
    print(f"Result {idx}:", f"Text: {doc.page_content}", "\n", sep="\n")



Raw Output:
 ### Summary

The paper examines Retrieval-Augmented Generation (RAG) techniques, which enhance large language models (LLMs) by integrating retrieval mechanisms to incorporate up-to-date information, mitigate hallucinations, and improve response quality. RAG workflows are complex, involving multiple components like query classification, retrieval, reranking, repacking, and summarization, each with various implementation options. The study conducts extensive experiments to identify optimal RAG practices, focusing on balancing performance and efficiency. It also explores multimodal retrieval techniques to enhance question-answering capabilities for visual inputs. The authors propose strategies for deploying RAG systems effectively and provide resources for further exploration.

### Key Findings

1. **Complexity of RAG Workflows**: RAG involves multiple components with various implementation options, making it complex and challenging to optimize.

2. **Optimal Practices for RA

In [11]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from dotenv import load_dotenv, find_dotenv
from keybert import KeyBERT
from bertopic import BERTopic
from sklearn.cluster import KMeans
from langchain.chat_models import ChatOpenAI
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import linkage, fcluster
import pandas as pd
from collections import defaultdict
import re
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer


# Load environment variables and show load_dotenv's return value
env_loaded = load_dotenv(find_dotenv())
print(env_loaded)

embedding_function = OpenAIEmbeddings(model="text-embedding-ada-002")

# Later on, you can reload the vector store without needing to re-embed the documents
vectorstore = FAISS.load_local(
    "index",
    embedding_function,
    allow_dangerous_deserialization=True,
)
llm = ChatOpenAI(model_name="gpt-4o")

# Retrieve the 20 chunks closest to the query embedding
query = "What are best practices for retrieval-augmented generation?"
query_embedding = embedding_function.embed_query(query)
retrieved_docs = vectorstore.similarity_search_by_vector(query_embedding, k=20)

# Concatenate the retrieved chunk texts, separated by blank lines
combined_text = "\n\n".join(doc.page_content for doc in retrieved_docs)



# Prompt asking the LLM to list the main topics as JSON objects
# carrying 'title' and 'description' fields
structured_topic_prompt_template = PromptTemplate(
    template=(
        "Identify the main topics in the following text and output them in JSON format "
        "with fields 'title' and 'description':\n\n"
        "Text:\n{text}"
    ),
    input_variables=["text"],
)

# Initialize LLM chain with structured prompt
structured_topic_chain = LLMChain(
    llm=llm,
    prompt=structured_topic_prompt_template,
)

# Run the LLM on the combined retrieved text to get structured JSON output
extracted_topics_json = structured_topic_chain.run({"text": combined_text})

import json


def _strip_code_fence(raw_text):
    """Remove a surrounding Markdown code fence (``` or ```json) from ``raw_text``.

    ``str.strip("```json")`` treats its argument as a *set of characters*
    rather than a prefix, so the fence is removed explicitly here.
    """
    text = raw_text.strip()
    if text.startswith("```"):
        text = text[3:]
        if text[:4].lower() == "json":
            text = text[4:]
        if text.rstrip().endswith("```"):
            text = text.rstrip()[:-3]
    return text.strip()


# Remove the Markdown fence the LLM may have wrapped around the JSON
cleaned_json_output = _strip_code_fence(extracted_topics_json)

# Parse JSON output. `extracted_topics_dict` is always bound afterwards
# (None when decoding fails), so the checks below cannot raise NameError.
try:
    extracted_topics_dict = json.loads(cleaned_json_output)
except json.JSONDecodeError as e:
    print("Error decoding JSON:", e)
    extracted_topics_dict = None
else:
    if not isinstance(extracted_topics_dict, list):
        print("Unexpected JSON format. Expected a list of dictionaries.")

# Keep (title, description) pairs only for list entries that are dictionaries
# carrying both fields; anything else is skipped.
if isinstance(extracted_topics_dict, list):
    llm_topics = [
        (topic["title"], topic["description"])
        for topic in extracted_topics_dict
        if isinstance(topic, dict) and "title" in topic and "description" in topic
    ]
else:
    llm_topics = []

# Extract only topic titles for vector store searches
llm_topic_titles = [topic[0] for topic in llm_topics if topic]  # Ensure no empty tuples are added

print("Combined Topics:", llm_topic_titles)



def search_vectorstore_for_topic(topic, vectorstore, top_k=10):
    """
    Return the documents in ``vectorstore`` closest to ``topic``.

    The topic is embedded with the module-level ``embedding_function`` and the
    resulting vector is passed to ``vectorstore.similarity_search_by_vector``.

    Args:
    - topic (str): The topic to search for.
    - vectorstore (FAISS): The vector store object.
    - top_k (int): The number of top documents to retrieve.

    Returns:
    - List of Document objects most relevant to the topic.
    """
    return vectorstore.similarity_search_by_vector(
        embedding_function.embed_query(topic),
        k=top_k,
    )


# Dictionary to hold retrieved documents for each topic.
# Built with a comprehension and distinct names so the earlier notebook
# variables `retrieved_docs` (the query results) and `docs` (the documents
# loaded in the first cell) are not silently rebound by loop variables.
topic_documents = {
    topic_title: search_vectorstore_for_topic(topic_title, vectorstore)
    for topic_title in llm_topic_titles
}

# Gather the final output text: for every topic a "## Topic" heading followed
# by the text of its retrieved documents, separated by blank lines.
final_combined_text = []
for topic_title, topic_docs in topic_documents.items():
    final_combined_text.append(f"\n\n## Topic: {topic_title}\n")
    final_combined_text.append(
        "\n\n".join(topic_doc.page_content for topic_doc in topic_docs)
    )

# Combine all parts into a single string
final_document = "\n".join(final_combined_text)

# Display the final combined document
#print("Final Combined Document:")
#print(final_document)





# Prompt asking the LLM to reorganise text into a cohesive document
# without dropping details
coherence_prompt_template = PromptTemplate(
    template="Make the following text into a cohesive and well-organized document without removing any details:\n\nText:\n{text}",
    input_variables=["text"]
)
coherence_chain = LLMChain(llm=llm, prompt=coherence_prompt_template)

# Baseline: rewrite only the text retrieved for the original query.
# Kept in its own variable so it stays available for comparison after the
# topic-expanded run below.
baseline_document = coherence_chain.run({"text": combined_text})

print("Before document:")
print(baseline_document)

# Topic-expanded run: rewrite the per-topic document. If no topics were
# parsed, `final_document` is empty; skip the LLM call instead of asking it
# to rewrite nothing.
if final_document.strip():
    detailed_document = coherence_chain.run({"text": final_document})
else:
    print("No topic documents were retrieved; reusing the baseline document.")
    detailed_document = baseline_document

# Display the final detailed document
print("After document:")
print(detailed_document)

 

True
Combined Topics: ['Retrieval-Augmented Generation (RAG) Techniques', 'Challenges in RAG Implementation', 'Optimal RAG Practices', 'Multimodal Retrieval Techniques', 'Maintainability and Efficiency of RAG Systems', 'Fine-Tuning and Enhancing RAG Systems', 'Evaluation Metrics and Benchmarking', 'Query Classification in RAG Workflows']
Before document:
# Searching for Best Practices in Retrieval-Augmented Generation

## Introduction

The rapid evolution of large language models (LLMs) has brought forth challenges in ensuring the accuracy and timeliness of generated responses. These models often produce outdated information or fabricate facts, despite being aligned with human preferences through reinforcement learning or other methods. Retrieval-augmented generation (RAG) techniques provide a powerful solution by integrating the strengths of pretraining and retrieval models, thus offering a robust framework for enhancing model performance. RAG enables rapid deployment of applications 