In [15]:
from langchain.retrievers import ContextualCompressionRetriever, MultiQueryRetriever

In [4]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from unstructured.partition.auto import partition
from langchain.schema import Document
import os
import pandas as pd
# 1. Document Loading and Page Tracking
# Every regular file directly inside `doc_folder` is parsed with unstructured's
# `partition`. Each parsed element becomes one record holding its text, the
# file it came from and the page number reported by the parser.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
docs = []
doc_folder = r'C:\Users\admin\Documents\LLM\B1-B data'
# Sorted so that the ingestion order does not depend on the directory listing order.
for filename in sorted(os.listdir(doc_folder)):
    filepath = os.path.join(doc_folder, filename)
    if not os.path.isfile(filepath):
        continue  # sub-directories are not descended into
    for element in partition(filename=filepath):
        # Extract text content and page information
        text = str(element)
        # Elements without a page number are labelled 'N/A'.
        page_number = element.metadata.page_number if element.metadata.page_number else 'N/A'
        docs.append({"source": filename, "content": text, "page": page_number})

# 2. Chunking while Preserving Page Information
# Parsed elements are usually much shorter than a chunk, so consecutive pieces
# are packed together until the size limit is reached. A chunk is closed early
# when the source file changes, and its metadata is fixed when the chunk is
# started, so every chunk names the file and page where its text begins.
CHUNK_SIZE = 4096
CHUNK_OVERLAP = 300
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
all_splits = []
current_chunk = ""
current_metadata = {}

for doc in docs:
    for split in text_splitter.split_text(doc['content']):
        same_source = current_metadata.get("source") == doc['source']
        fits = len(current_chunk) + len(split) <= CHUNK_SIZE
        if current_chunk and same_source and fits:
            current_chunk += split + " "  # Add to the current chunk
            continue
        # Close the chunk in progress (if any) before starting a new one; the
        # guard prevents emitting an empty Document with empty metadata.
        if current_chunk:
            all_splits.append(Document(page_content=current_chunk, metadata=current_metadata))
        current_chunk = split + " "
        current_metadata = {"source": doc['source'], "page": doc['page']}

# Append the last chunk
if current_chunk:
    all_splits.append(Document(page_content=current_chunk, metadata=current_metadata))

# 3. Vectorstore and Retriever Setup
# The chunks are embedded with the Ollama embedding model and indexed in
# Chroma; `llm` is the chat model used later for answering.
model = OllamaEmbeddings(model="nomic-embed-text")
vectorstore = Chroma.from_documents(documents=all_splits, embedding=model)
llm = ChatOllama(model="llama3.1:8b")  # Or your preferred LLM

In [39]:
# 4. RAG Function (Incorporating Contextual Compression)
def RAG(user_prompt, llm, vectorstore, stream=False, source_summaries=False, retrieval = 'contextual', top_k_hits = 5):
    """Answer ``user_prompt`` using documents retrieved from ``vectorstore``.

    Documents are retrieved exactly once, and the same documents are used both
    as the context given to the model and as the rows of the returned source
    table, so the table always lists what the answer was based on.

    Args:
        user_prompt: The question to answer.
        llm: Chat model used for answering, for contextual compression and
            for the optional per-source summaries.
        vectorstore: Vector store queried for relevant documents.
        stream: If True, print the answer as it is generated and return an
            empty string in place of the answer.
        source_summaries: If True, add a "short summary" column produced by
            one extra ``llm`` call per retrieved document.
        retrieval: 'contextual' (LLM-compressed hits), 'cosine_similarity'
            (plain similarity search) or 'both' (compressed hits followed by
            the plain hits).
        top_k_hits: Number of documents requested from the vector store by
            each retrieval method.

    Returns:
        A tuple ``(answer, source_df)``. ``source_df`` has the columns
        "source", "page" and "content" (plus "short summary" when requested).
        When nothing is retrieved the result is
        ``("No relevant documents found", <empty DataFrame>)``.

    Raises:
        ValueError: If ``retrieval`` is not one of the supported methods.
    """
    valid_methods = ('contextual', 'cosine_similarity', 'both')
    if retrieval not in valid_methods:
        # Fail early with a clear message instead of hitting an unbound local later.
        raise ValueError(f"Unknown retrieval method {retrieval!r}; expected one of {valid_methods}")

    def format_docs(documents):
        # Render each document with its citation header, separated by blank lines.
        return "\n\n".join(
            f"Source: {doc.metadata.get('source', 'N/A')} - Page: {doc.metadata.get('page', 'N/A')}\n\n{doc.page_content}"
            for doc in documents
        )

    RAG_TEMPLATE = """
    You are an assistant for question-answering tasks. 
    Use the following pieces of retrieved context to answer the question. 
    If you don't know the answer, just say that you don't know.  
    
    <context>
    {context}
    </context>
    
    Answer the following question:
    
    {question}"""
    question = user_prompt

    # Retrieve once; everything below works on this single list.
    docs = []
    if retrieval in ('contextual', 'both'):
        # The compressor asks the LLM to keep only the passages of each hit
        # that are relevant to the question.
        base_retriever = vectorstore.as_retriever(search_kwargs={"k": top_k_hits})
        compressor = LLMChainExtractor.from_llm(llm)
        compression_retriever = ContextualCompressionRetriever(
            base_compressor=compressor, base_retriever=base_retriever
        )
        docs.extend(compression_retriever.invoke(question))
    if retrieval in ('cosine_similarity', 'both'):
        docs.extend(vectorstore.similarity_search(question, k=top_k_hits))

    if not docs:
        return "No relevant documents found", pd.DataFrame()

    source_df = pd.DataFrame([
        {
            "source": doc.metadata.get('source', 'N/A'),
            "page": doc.metadata.get('page', 'N/A'),
            "content": doc.page_content,
        }
        for doc in docs
    ])

    if source_summaries:
        # One LLM call per retrieved document.
        source_df["short summary"] = [
            llm.invoke(f'Summarize this in one or two sentences. Only state main point, nothing else. <{doc.page_content}> ').content
            for doc in docs
        ]

    # The context is built from the documents retrieved above rather than from
    # a second retrieval inside the chain.
    rag_prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)
    answer_chain = rag_prompt | llm | StrOutputParser()
    chain_input = {"context": format_docs(docs), "question": question}

    if stream:
        for chunk in answer_chain.stream(chain_input):
            print(chunk, end="", flush=True)
        # The answer has already been printed, so nothing is returned for it.
        return '', source_df

    return answer_chain.invoke(chain_input), source_df


In [None]:
# Example query: stream the answer while it is generated, then show the table
# of retrieved sources (with one short summary per source).
user_prompt = "Summarize the findings of the human factors study on Helmet Mounted Displays (HMD) for B-1B pilots."
rag_options = dict(stream=True, source_summaries=True, retrieval='both', top_k_hits=8)
result, sources_df = RAG(user_prompt, llm, vectorstore, **rag_options)
print(result)
print('\nSource information:')
sources_df