## Ingesting PDF

In [None]:
%pip install --q unstructured langchain
%pip install --q "unstructured[all-docs]"

In [None]:
import os
import getpass
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from opensearchpy import OpenSearch
from opensearchpy.helpers import bulk

In [None]:
# Load and process the PDF
local_path = "Al Aqsa Flood.pdf"

# Fail fast with a clear error when the file is missing. Raising keeps the
# kernel alive (unlike exit()) and stops later cells from running without `data`.
if not local_path or not os.path.isfile(local_path):
    raise FileNotFoundError(
        f"PDF not found at {local_path!r}. Upload a PDF file and update `local_path`."
    )

loader = UnstructuredPDFLoader(file_path=local_path)
data = loader.load()

## Vector Embeddings

In [None]:
# Cut the loaded document into chunks (chunk_size=1000, chunk_overlap=50).
# NOTE(review): a later cell rebuilds `text_splitter` and `chunks` with
# chunk_overlap=20, so on a top-to-bottom run this result is overwritten.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=1000,
)
chunks = text_splitter.split_documents(data)

In [None]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

In [None]:
# Split the loaded document into chunks.
# NOTE(review): this repeats the earlier split cell with chunk_overlap=20
# instead of 50; the `chunks` assigned here are the ones later cells index.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 20}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
chunks = text_splitter.split_documents(data)

In [None]:
# Connect to OpenSearch
client = OpenSearch(hosts=['localhost:9200'])

# Definition of the index that stores one k-NN vector plus the raw text per chunk
index_name = "rag-document-embeddings"
index_body = {
    "settings": {"index": {"knn": True, "knn.algo_param.ef_search": 100}},
    "mappings": {
        "properties": {
            "text_vector": {
                "type": "knn_vector",
                "dimension": 384,  # all-MiniLM-L6-v2 outputs 384-dimensional vectors
                "method": {
                    "name": "hnsw",
                    "space_type": "cosinesimil",
                    "engine": "lucene",
                    "parameters": {"ef_construction": 128, "m": 24}
                }
            }
        }
    }
}

# Create the index only when it is missing. Checking explicitly (instead of
# ignoring every HTTP 400) keeps re-runs idempotent while still surfacing real
# bad-request errors such as an invalid mapping.
if not client.indices.exists(index=index_name):
    client.indices.create(index=index_name, body=index_body)

In [None]:
# Embedding model shared by the indexing loop and the query-time retriever.
# Its output size has to agree with the "dimension": 384 declared in the index mapping.
embedding_model_name = "all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

In [None]:
# Index document chunks into OpenSearch
# Embed all chunk texts in one batched call rather than one model call per chunk.
chunk_texts = [chunk.page_content for chunk in chunks]
chunk_vectors = embeddings.embed_documents(chunk_texts)

# One bulk action per chunk; the chunk's position is its document id, so
# re-running this cell overwrites the same documents instead of duplicating them.
actions = [
    {
        "_index": "rag-document-embeddings",
        "_id": i,
        "_source": {
            "text_vector": vector,
            "content": text
        }
    }
    for i, (text, vector) in enumerate(zip(chunk_texts, chunk_vectors))
]

bulk(client, actions)

## Retrieval

In [None]:
# Set up Hugging Face LLM
# Prompt for the token only when it is not already in the environment, so that
# re-running the cell (or running non-interactively) reuses the existing value.
if not os.environ.get('HUGGING_FACE_HUB_API_KEY'):
    os.environ['HUGGING_FACE_HUB_API_KEY'] = getpass.getpass('Hugging face api key:')

repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
llm = HuggingFaceHub(
    huggingfacehub_api_token=os.environ['HUGGING_FACE_HUB_API_KEY'],
    repo_id=repo_id,
    model_kwargs={'temperature': 0.2, 'max_length': 5000}
)

In [None]:
# Function to retrieve relevant document chunks
def retrieve_chunks(query: str, k: int = 3) -> list:
    """Return the text of the stored chunks closest to ``query``.

    The query is embedded with the module-level ``embeddings`` object and sent
    as a k-NN search against the ``rag-document-embeddings`` index through the
    module-level ``client``.

    Args:
        query: Natural-language text to search for.
        k: Number of nearest neighbours to request.

    Returns:
        The ``content`` field of each hit, in the order the search returned them.
    """
    embedding = embeddings.embed_query(query)
    response = client.search(
        index="rag-document-embeddings",
        body={
            # Without an explicit size the search returns at most the default
            # page of 10 hits, silently truncating any request with k > 10.
            "size": k,
            "query": {
                "knn": {
                    "text_vector": {
                        "vector": embedding,
                        "k": k
                    }
                }
            },
            "_source": ["content"]  # Only return the content field
        }
    )
    return [hit['_source']['content'] for hit in response['hits']['hits']]

In [None]:
# Prompt that asks the LLM to rephrase the user's question several ways;
# the rephrasings are used as extra search queries by the multi-query retriever.
query_prompt_text = (
    "Generate five different versions of this question to improve retrieval:\n"
    "    Original question: {question}"
)
QUERY_PROMPT = PromptTemplate(
    template=query_prompt_text,
    input_variables=["question"],
)

def multi_query_retriever(query):
    """Retrieve chunks for several LLM-generated rephrasings of ``query``.

    The LLM output is split into lines and each non-blank line is used as a
    separate search query. If the LLM returns no usable line, the original
    query is searched instead.

    Args:
        query: The user's question.

    Returns:
        Unique chunk texts in first-seen order, so the context assembled from
        them is the same on every run.
    """
    raw_output = llm.invoke(QUERY_PROMPT.format(question=query))

    # Drop blank or whitespace-only lines; embedding an empty string would
    # pull unrelated chunks into the context.
    variations = [line.strip() for line in raw_output.splitlines() if line.strip()]
    if not variations:
        variations = [query]

    all_chunks = []
    for var in variations:
        all_chunks.extend(retrieve_chunks(var))

    # dict.fromkeys removes duplicates while keeping retrieval order
    # (set() ordering of strings changes between interpreter runs).
    return list(dict.fromkeys(all_chunks))

In [None]:
# Set up the RAG chain
# Answer prompt: retrieved context first, then the user's question.
template = (
    "Answer based ONLY on this context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
    "Answer:"
)

prompt = PromptTemplate(input_variables=["context", "question"], template=template)


def _joined_context(question):
    """Fetch chunks for the question and join them with blank lines."""
    return "\n\n".join(multi_query_retriever(question))


# The input string fans out to both prompt variables, then flows through the
# prompt, the LLM, and a parser that yields a plain string.
chain = (
    {"context": _joined_context, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Main interaction loop
print("Welcome! I'm here to answer your questions about 'Al Aqsa Flood'.")
print("Type your question and press Enter. Type 'exit' to quit.\n")

while True:
    try:
        # Strip so that inputs like "exit " or " Exit" still end the session
        query = input("Your question: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the cell was interrupted: leave the loop cleanly
        break
    if not query:
        # Nothing typed; ask again rather than sending an empty question to the chain
        continue
    if query.lower() == 'exit':
        break
    try:
        response = chain.invoke(query)
        print("Answer:", response)
    except Exception as e:
        # Keep the session alive on retrieval/LLM failures and show the cause
        print("An error occurred:", e)

print("\nThank you for using the RAG system. Goodbye!")