In [4]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from unstructured.partition.auto import partition
from langchain.schema import Document
import os
import pandas as pd
#from langchain.document_loaders import DirectoryLoader
#from langchain_community.document_loaders import TextLoader

# 1. Document Loading and Page Tracking
docs = []
doc_folder = r'C:\Users\admin\Documents\LLM\B1-B data'
for filename in os.listdir(doc_folder):
    filepath = os.path.join(doc_folder, filename)
    if os.path.isfile(filepath):
        elements = partition(filename=filepath)
        for i, element in enumerate(elements):
            # Extract text content and page information
            text = str(element) 
            page_number = element.metadata.page_number if element.metadata.page_number else 'N/A'  # Extract page info
            docs.append({"source": filename, "content": text, "page": page_number})

# 2. Chunking while Preserving Page Information
text_splitter = RecursiveCharacterTextSplitter(chunk_size=4096, chunk_overlap=300)
all_splits = []
current_chunk = ""
current_metadata = {} 

for doc in docs:
    splits = text_splitter.split_text(doc['content'])
    for split in splits:
        if len(current_chunk) + len(split) <= 4096: 
            current_chunk += split + " " # Add to the current chunk
            current_metadata = {"source": doc['source'], "page": doc['page']} 
        else:
            all_splits.append(Document(page_content=current_chunk, metadata=current_metadata))
            current_chunk = split + " "
            current_metadata = {"source": doc['source'], "page": doc['page']}

# Append the last chunk
if current_chunk:
    all_splits.append(Document(page_content=current_chunk, metadata=current_metadata)) 

# 3. Vectorstore and Retriever Setup
model = OllamaEmbeddings(model="nomic-embed-text")
vectorstore = Chroma.from_documents(documents=all_splits, embedding=model)
llm = ChatOllama(model="llama3.1:8b")  # Or your preferred LLM

In [41]:
# 4. RAG Function (Incorporating Contextual Compression)
def RAG(user_prompt, llm, vectorstore, stream=False, source_summaries=False, retrieval = 'contextual', top_k_hits = 5):
    # retrieval methods: contextual, cosine_similarity, both
    def format_docs(docs):
        return "\n\n".join(
            f"Source: {doc.metadata['source']} - Page: {doc.metadata.get('page', 'N/A')}\n\n{doc.page_content}" 
            for doc in docs
        )

    RAG_TEMPLATE = """
        This is a chat between a user and an artificial intelligence assistant. 
        The assistant gives helpful, detailed, and polite answers to the user's questions based on the context. 
        With regard to the source, 'embedded' means the content was embedded in the PDF file, and 'predicted' means the content was generated by an OCR model. 
        As such, there may be inaccuracies (spelling, spacing, missing symbols, etc.) in the predicted content, the embedded content, and the actual content. 
        The assistant should also indicate when the answer cannot be found in the context.

        When providing an answer, cite the source document and page number in parentheses where you found the relevant information, like this: ([Source: document_name, Page 1]). 
        If multiple sources contain relevant information, cite them all.

        <context>
        {context}
        </context>

        Answer the following question:

        {question}"""
    question = user_prompt
    rag_prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)
    retriever = vectorstore.as_retriever()
    if retrieval == 'contextual' or retrieval == 'both':
        compressor = LLMChainExtractor.from_llm(llm) 
        compression_retriever = ContextualCompressionRetriever(
            base_compressor=compressor, base_retriever=retriever
        )
        qa_chain = (
            {"context": compression_retriever | format_docs, "question": RunnablePassthrough()} 
            | rag_prompt
            | llm
            | StrOutputParser() 
        )
        
        docs = compression_retriever.invoke(question)  # Invoke on the question 
    if retrieval == 'cosine_similarity' or retrieval == 'both':
        if retrieval != 'both':
            qa_chain = (
                {"context": retriever | format_docs, "question": RunnablePassthrough()}
                | rag_prompt
                | llm
                | StrOutputParser()
            )
        if retrieval == 'both':
            docs.extend(vectorstore.similarity_search(question, k = top_k_hits))
        else:
            docs = vectorstore.similarity_search(question, k = top_k_hits)

    if not docs:
        return "No relevant documents found", pd.DataFrame()

    source_data = []
    for doc in docs:
        source_data.append({
            "source": doc.metadata['source'], 
            "page": doc.metadata.get('page', 'N/A'),
            "content": doc.page_content 
        })

    if source_summaries:
        summaries = [llm.invoke(f'Summarize this in one or two sentences. Only state main point, nothing else. <{doc.page_content}> ').content for doc in docs]
        source_df = pd.DataFrame(source_data)
        source_df["short summary"] = summaries
    else:
        source_df = pd.DataFrame(source_data)

    if stream:
        for chunk in qa_chain.stream(question):
            print(chunk, end="", flush=True)
        return '', source_df
    else:
        result = qa_chain.invoke(question)
        return result, source_df


In [42]:
user_prompt = "Summarize the findings of the human factors study on Helmet Mounted Displays (HMD) for B-1B pilots."
result, sources_df = RAG(user_prompt, llm, vectorstore, stream=True, source_summaries=False, retrieval = 'both', top_k_hits = 3) 
print(result)
print('\nSource information:')
sources_df

Based on the provided context, the findings of the human factors study on Helmet Mounted Displays (HMD) for B-1B pilots can be summarized as follows:

The HMD was considered a useful tool in improving situational awareness (SA) for B-1B pilots, particularly in providing navigation information and steering arrows directly to the Designated Missions Points (DMPIs). The ability to glance to the right or left and see the DMPIs was deemed very useful.

However, the study also highlighted some limitations and areas for improvement:

* The HMD's symbology required readdressing, particularly in relation to the JDAM footprint display.
* The LAR depiction needed to be higher in the field of view.
* Symbology was too small at the top and not all-inclusive.
* Head movement was increased with the current design, making it difficult to see threats at 3+9 o'clock positions.

The study's participants generally agreed that a properly designed HMD would be beneficial for both JDAM releases and threat av

Unnamed: 0,source,page,content
0,A Human Factors Study of a Helmet Mounted Disp...,11,* 1.3 New smart weapons and advanced avionics ...
1,A Human Factors Study of a Helmet Mounted Disp...,23,> Question: Summarize the findings of the huma...
2,A Human Factors Study of a Helmet Mounted Disp...,35,Here are the extracted relevant parts of the c...
3,A Human Factors Study of a Helmet Mounted Disp...,30,The extracted part of the context is:\n\n* The...
4,A Human Factors Study of a Helmet Mounted Disp...,11,JHMCS LAR LCD MRAD NM SAM TTG UTTR VSD LIST OF...
5,A Human Factors Study of a Helmet Mounted Disp...,23,7 Totally Acceptable 6 Very Acceptable 5 Somew...
6,A Human Factors Study of a Helmet Mounted Disp...,35,8. Were all the HMD Lines clear and distinct? ...
