In [1]:
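# This is a development (scratch) notebook; it is not meant to be used as a demo.
# Cells were executed out of order (see the execution counts), so it is not
# expected to run top-to-bottom on a fresh kernel as-is.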

In [43]:
from rich import print
from langchain_community.document_loaders import UnstructuredFileLoader
from unstructured.cleaners.core import clean_extra_whitespace, group_broken_paragraphs
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# One loader per source PDF; every loader is configured with the same pair of
# post-processors (each loader receives its own list instance).
loaders = [
    UnstructuredFileLoader(
        pdf_path,
        post_processors=[clean_extra_whitespace, group_broken_paragraphs],
    )
    for pdf_path in (
        "/teamspace/studios/this_studio/example_data/2401.08406.pdf",
        "/teamspace/studios/this_studio/example_data/2401.00908.pdf",
    )
]

In [3]:
# Split only on blank-line boundaries (triple newline first, then double
# newline). The separators are literal strings, not regular expressions.
# Chunk length is measured with len(): 1000 per chunk, 300 of overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=300,
    length_function=len,
    separators=["\n\n\n", "\n\n"],
    is_separator_regex=False,
)

# Run every loader and gather all resulting chunks into one flat list.
docs = []
for loader in loaders:
    docs += loader.load_and_split(text_splitter=text_splitter)

In [4]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import FAISS

In [5]:
# Embedding model configuration; the model is placed on the "cuda" device.
model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = dict(device="cuda")
# Normalized embeddings are required for the scores to be cosine similarities.
encode_kwargs = dict(normalize_embeddings=True)

embeddings_model = HuggingFaceBgeEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

In [6]:
# Build the FAISS vector store from the chunked documents and the embedding model.
db = FAISS.from_documents(docs, embeddings_model)

In [7]:
def pretty_print_docs(results, *scores):
    """Print each result followed by a dashed separator.

    Args:
        results: Iterable of items to print. Items that are tuples are
            printed as two lines: element 1 first, then element 0.
        *scores: Optional extra positional values. When any are given they
            are zipped with ``results`` so every entry becomes an
            ``(item, score)`` tuple; zipping stops at the shorter input.

    Returns:
        None. Everything is written with ``print``.
    """
    divider = "\n---------\n"
    entries = zip(results, scores) if scores else results

    for entry in entries:
        if not isinstance(entry, tuple):
            print(entry)
        else:
            # Second element (e.g. the score) goes first, then the item itself.
            score, item = entry[1], entry[0]
            print(score)
            print(item)
        print(divider)

In [8]:
# Evaluation questions used throughout the notebook.
# NOTE: the query strings are kept verbatim (including the "contries"
# misspelling in query3 and the space before "?" in query7/query8), because
# editing them would change what is sent to the retriever.
query1, query2, query3, query4 = (
    "Where was the agriculture dataset collected for the USA?",
    "How many pdf data were collected from the USA?",
    "Which contries were used to collect dataset?",
    "What are the metrics used to evaluate the answers?",
)
query5, query6, query7, query8 = (
    "how was the content and structure of available documents augmented?",
    "What was the answer generation process used in the paper?",
    "How many pdf data were collected from the USA ?",
    "What is the DocLLM architecture ?",
)

In [44]:
# Requires `retriever`, `rerank_docs` and `reranker_model`, which are defined
# in cells further down this notebook; run those cells first.
# NOTE(review): query6 is defined above but is not part of this list —
# confirm whether the omission is intentional.
queries = [
    query1,
    query2,
    query3,
    query4,
    query5,
    query7,
    query8,
]

# For every query: fetch candidates from the retriever, rerank them, then show
# the top-ranked chunk together with its metadata.
for i, query in enumerate(queries, start=1):
    print(f"Example {i}: Query->", query)
    print(".." * 50)
    print("Retrieved document:")

    retrieved_documents = retriever.get_relevant_documents(query)
    # rerank_docs is defined as rerank_docs(query, retrieved_docs) and reads
    # the module-level reranker_model itself, so the model is not passed here.
    reranked_documents = rerank_docs(query, retrieved_documents)

    print("--" * 50)
    if reranked_documents:
        # Each entry is a (document, score) pair; the first one ranks highest.
        best_document = reranked_documents[0][0]
        print(best_document.page_content)
        print("--" * 50)
        print("metadata:", best_document.metadata)
    else:
        # Avoid an IndexError when nothing was retrieved for this query.
        print("No documents were retrieved for this query.")
    print("==" * 50, "\n\n")

NameError: name 'q' is not defined

In [9]:
# Quick check on query1: fetch the single best match along with its relevance
# score, echo the query, then print the (document, score) result.
results = db.similarity_search_with_relevance_scores(query=query1, k=1)
print(query1)
pretty_print_docs(results)

In [10]:
from sentence_transformers import CrossEncoder

# Cross-encoder used to rescore (query, passage) pairs; max_length is set to 512.
reranker_model = CrossEncoder("BAAI/bge-reranker-large", max_length=512)


def rerank_docs(query, retrieved_docs, model=None):
    """Rerank retrieved documents against a query, best match first.

    Args:
        query: The query text paired with every document.
        retrieved_docs: Iterable of objects exposing a ``page_content``
            attribute. ``None`` is treated like an empty collection.
        model: Optional scorer exposing ``predict(pairs)`` that returns one
            score per ``(query, text)`` pair. Defaults to the module-level
            ``reranker_model``.

    Returns:
        A list of ``(document, score)`` tuples sorted by score in descending
        order. An empty list is returned when there is nothing to rerank.
    """
    # Materialise once: the documents are iterated twice below (to build the
    # pairs and to zip them with their scores).
    candidates = [] if retrieved_docs is None else list(retrieved_docs)
    if not candidates:
        # Nothing to score, so do not call the model with an empty batch.
        return []

    scorer = reranker_model if model is None else model
    query_and_docs = [(query, doc.page_content) for doc in candidates]
    scores = scorer.predict(query_and_docs)
    return sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)

In [33]:
# Retriever view over the vector store, configured with k=10 so that the
# reranking step has several candidates to choose from.
retriever = db.as_retriever(search_kwargs=dict(k=10))

In [40]:
# Select the query used by the ad-hoc retrieval cells that follow, and echo it.
query = query8
print(query)

In [41]:
# Plain vector search (no reranking): fetch the single top hit for the current
# query and print only the document, leaving out its score.
results = db.similarity_search_with_score(query=query, k=1)
for document, _score in results:
    print(document)

In [42]:
# Retrieve candidates for the current query, rerank them, and show the
# top-ranked (document, score) pair.
retrieved_documents = retriever.get_relevant_documents(query=query)
reranked_documents = rerank_docs(
    query=query,
    retrieved_docs=retrieved_documents,
)
print(reranked_documents[0])