In [None]:
from unstructured.partition.auto import partition
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

# Folder whose files are partitioned into text and then chunked.
doc_folder = r'C:\Users\admin\Documents\testtxtdir\pdfdir'

# Collect one {"source", "content"} record per regular file in the folder.
# os.listdir() returns entries in arbitrary order, so iterate in sorted order
# to keep `docs` (and therefore `all_splits`) reproducible between runs.
docs = []
for filename in sorted(os.listdir(doc_folder)):
    filepath = os.path.join(doc_folder, filename)
    if not os.path.isfile(filepath):
        continue  # skip sub-directories and other non-file entries
    elements = partition(filename=filepath)
    # Join the string form of every partitioned element, blank-line separated.
    text = "\n\n".join(str(el) for el in elements)
    docs.append({"source": filename, "content": text})

# Split each document's text into chunks and flatten them into one list.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=15)
all_splits = []
for doc in docs:
    splits = text_splitter.split_text(doc['content'])
    # NOTE: "page" is the 1-based position of the chunk within its document,
    # not a page number taken from the original file.
    for i, split in enumerate(splits, start=1):
        all_splits.append({"source": doc['source'], "content": split, "page": i})

In [None]:
# Print the full contents of `all_splits` (nothing is truncated).
print(all_splits)

In [1]:
import gc
gc.collect()
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader

# Load documents from a local directory
#loader = DirectoryLoader(
#    r'C:\Users\admin\Documents\testtxtdir',  # Specify the path to your local directory
#)
# Load the documents
#data = loader.load()

# Read the flight-manual text file into LangChain documents.
manual_path = r"C:\Users\admin\Documents\testtxtdir\rawtxtdir\F16_flight_manual.txt"
data = TextLoader(manual_path).load()

# Cut the loaded documents into chunks (chunk_size=1024, chunk_overlap=15).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=15)
all_splits = text_splitter.split_documents(data)

# Embed the chunks with the Ollama embedding model and index them in Chroma.
model = OllamaEmbeddings(model="nomic-embed-text")
vectorstore = Chroma.from_documents(documents=all_splits, embedding=model)

# Chat model used to generate answers.
llm = ChatOllama(model="llama3.1:8b")

In [6]:
def RAG(user_prompt, llm, vectorstore, top_k_hits = 3, stream = False, source_summaries = False):
    """Answer ``user_prompt`` from retrieved context and report the sources used.

    The ``top_k_hits`` most similar chunks are fetched from ``vectorstore``
    exactly once. Those same chunks are used both as the prompt context and as
    the rows of the returned source table, so the table always reflects what
    the model was shown and ``top_k_hits`` controls the context size.

    Args:
        user_prompt: The question to answer.
        llm: Chat model. It is piped after the prompt, and it is also called
            through ``llm.invoke(...).content`` when ``source_summaries`` is true.
        vectorstore: Object exposing ``similarity_search(query, k=...)`` that
            returns documents carrying ``page_content`` and ``metadata``.
        top_k_hits: Number of chunks to retrieve and place in the context.
        stream: When true, print the answer piece by piece as it is generated
            and return an empty string in place of the answer.
        source_summaries: When true, add a "short summary" column holding a
            one-sentence LLM summary of every retrieved chunk.

    Returns:
        A tuple ``(answer, source_df)``. ``answer`` is the generated text, ``''``
        when ``stream`` is true, or ``"No relevant documents found"`` when the
        search returns nothing. ``source_df`` has the columns "source" and
        "content" (plus "short summary" if requested) with one row per chunk,
        or is an empty DataFrame when nothing was retrieved.
    """
    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)

    question = user_prompt

    # Retrieve once, and bail out before building any chain if nothing matched.
    docs = vectorstore.similarity_search(question, k = top_k_hits)
    if not docs:
        return "No relevant documents found", pd.DataFrame()

    # Build the source table column by column; a missing 'source' entry in a
    # document's metadata becomes None instead of raising KeyError.
    source_columns = {
        "source": [doc.metadata.get('source') for doc in docs],
        "content": [doc.page_content for doc in docs],
    }
    if source_summaries:
        source_columns["short summary"] = [
            llm.invoke(f'summarize this in one sentence. <{doc.page_content}> ').content
            for doc in docs
        ]
    source_df = pd.DataFrame(source_columns)

    # For conciseness add: Use three sentences maximum and keep the answer concise.
    RAG_TEMPLATE = """
    You are an assistant for question-answering tasks. 
    Use the following pieces of retrieved context to answer the question. 
    If you don't know the answer, just say that you don't know. 
    Keep it brief. 
    
    <context>
    {context}
    </context>
    
    Answer the following question:
    
    {question}"""

    rag_prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)

    # The context is supplied directly from the documents retrieved above, so
    # no second (differently sized) retrieval happens inside the chain.
    qa_chain = rag_prompt | llm | StrOutputParser()
    chain_input = {"context": format_docs(docs), "question": question}

    if stream:
        for chunk in qa_chain.stream(chain_input):
            print(chunk, end="", flush=True)
        # The text was already printed; return '' so callers that print the
        # result afterwards do not show the answer twice.
        return '', source_df

    return qa_chain.invoke(chain_input), source_df

In [8]:
# Question put to the RAG pipeline.
user_prompt = "Tell me about the air refueling process"

# Stream the answer while it is generated and collect the retrieved sources.
result, sources_df = RAG(
    user_prompt,
    llm,
    vectorstore,
    top_k_hits=3,
    stream=True,
    source_summaries=False,
)

print(result)
print('\nSource information:')
sources_df

The air refueling process involves precise and detailed planning, where both the tanker and receiver crew must be thoroughly familiar with all aspects of the refueling. The procedure requires coordination between planners and crews to ensure success. During the operation, the tanker boom is controlled by the boom operator, while fuel transfer (pressure, flow, quantity) is normally controlled by the tanker crew. The refueling sequence typically involves a lead receiver, followed by subsequent receivers in a structured formation, with no more than three aircraft on each wing of the tanker.

Source information:


Unnamed: 0,source,content
0,C:\Users\admin\Documents\testtxtdir\F16_flight...,Terminating refueling with partially filled ta...
1,C:\Users\admin\Documents\testtxtdir\F16_flight...,INTRODUCTION\n\nThis section contains informat...
2,C:\Users\admin\Documents\testtxtdir\F16_flight...,812Change 1\n\nPost Air Refueling\n\nUpon ...


In [2]:
#bm25 method
import bm25s

# Small in-memory corpus used to try out BM25 retrieval.
corpus = [
    'cats and dogs are pets.',
    'cats and dogs are pet animals though I prefer dogs. Dogs obey our commands, can be trained easily and play with us all the time.',
    'And this is the third one.',
    'Horses are also pets',
]

# Tokenize the corpus (with the "en" stopword setting), then build the index.
corpus_tokens = bm25s.tokenize(corpus, stopwords="en")
retriever = bm25s.BM25()
retriever.index(corpus_tokens)

# Tokenize the search query.
query = "dogs"
query_tokens = bm25s.tokenize(query)

# Fetch the two best matches together with their scores. Because corpus= is
# passed, `results` holds the matching corpus entries themselves (as the printed
# output shows); both arrays have shape (n_queries, k).
results, scores = retriever.retrieve(query_tokens, corpus=corpus, k=2)
print(results, scores)

Split strings:   0%|          | 0/4 [00:00<?, ?it/s]

BM25S Count Tokens:   0%|          | 0/4 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/4 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

[['cats and dogs are pets.'
  'cats and dogs are pet animals though I prefer dogs. Dogs obey our commands, can be trained easily and play with us all the time.']] [[0.3659254  0.32038802]]
