In [18]:
from langchain.vectorstores import Qdrant
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from qdrant_client import QdrantClient

In [2]:
loader=PyPDFLoader("pdfs/nexon-owner-manual-2022.pdf")
documents=loader.load()

In [4]:
from typing import Any
from pydantic import BaseModel

In [5]:
from unstructured.partition.pdf import partition_pdf

output_dir = "images"

# Get elements
raw_pdf_elements = partition_pdf(
    filename="pdfs/nexon-owner-manual-2022.pdf",
    # Using pdf format to find embedded image blocks
    extract_images_in_pdf=True,
    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
    # Titles are any sub-section of the document
    infer_table_structure=True,
    # Post processing to aggregate text once we have the title
    chunking_strategy="by_title",
    # Chunking params to aggregate text blocks
    # Attempt to create a new chunk 3800 chars
    # Attempt to keep chunks > 2000 chars
    # Hard max on chunks
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    image_output_dir_path=output_dir,
)

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
raw_pdf_elements

[<unstructured.documents.elements.CompositeElement at 0x7ff1f07248b0>,
 <unstructured.documents.elements.CompositeElement at 0x7ff1f0724790>,
 <unstructured.documents.elements.Table at 0x7ff1f0724940>,
 <unstructured.documents.elements.CompositeElement at 0x7ff1f07247f0>,
 <unstructured.documents.elements.Table at 0x7ff1f0724c10>,
 <unstructured.documents.elements.CompositeElement at 0x7ff1f0724fd0>,
 <unstructured.documents.elements.Table at 0x7ff1f0724dc0>,
 <unstructured.documents.elements.CompositeElement at 0x7ff1f0725330>,
 <unstructured.documents.elements.Table at 0x7ff1f0725210>,
 <unstructured.documents.elements.CompositeElement at 0x7ff1f07251b0>,
 <unstructured.documents.elements.Table at 0x7ff1f07256f0>,
 <unstructured.documents.elements.CompositeElement at 0x7ff1f0725a50>,
 <unstructured.documents.elements.Table at 0x7ff1f0725930>,
 <unstructured.documents.elements.CompositeElement at 0x7ff1f0725de0>,
 <unstructured.documents.elements.Table at 0x7ff1f0725cc0>,
 <unstructur

In [7]:
category_counts = {}

for element in raw_pdf_elements:
    category = str(type(element))
    if category in category_counts:
        category_counts[category] += 1
    else:
        category_counts[category] = 1

# Unique_categories will have unique elements
# TableChunk if Table > max chars set above
unique_categories = set(category_counts.keys())
category_counts

{"<class 'unstructured.documents.elements.CompositeElement'>": 164,
 "<class 'unstructured.documents.elements.Table'>": 62}

In [8]:
class Element(BaseModel):
    type: str
    text: Any


# Categorize by type
categorized_elements = []
for element in raw_pdf_elements:
    if "unstructured.documents.elements.Table" in str(type(element)):
        categorized_elements.append(Element(type="table", text=str(element)))
    elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
        categorized_elements.append(Element(type="text", text=str(element)))

# Tables
table_elements = [e for e in categorized_elements if e.type == "table"]
print(len(table_elements))

# Text
text_elements = [e for e in categorized_elements if e.type == "text"]
print(len(text_elements))

62
164


In [9]:
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
# Prompt
prompt_text = """You are an assistant tasked with summarizing tables and text. \
Give a concise summary of the table or text. Table or text chunk: {element} """
prompt = ChatPromptTemplate.from_template(prompt_text)

# Summary chain
model = ChatOllama(model="llama3")
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

# Apply to tables
tables = [i.text for i in table_elements]
table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})

In [10]:
table_summaries[0]

'Here is a concise summary of the table:\n\nThe table describes important safety features in vehicles, including:\n\n* Seat belts and child restraint system (CRS)\n* Supplementary Restraint System (SRS) airbags (if equipped)\n* Child lock (if equipped)\n* Anti-theft devices: Immobilizer/PEPS\n* Braking systems:\n\t+ Anti-lock braking system (ABS) (if equipped)\n\t+ Electronic brake force distribution (EBD) (if equipped)\n\t+ Electronic stability program (ESP) (if equipped)\n\nThese features aim to ensure safe driving and prevent accidents.'

In [14]:
# add all sumaries one string

table_summaries = " ".join(table_summaries)

In [15]:
text_splitter=RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100
)

text_chunks=text_splitter.split_documents(documents)

In [16]:
text_splitter=RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100
)

table_chunks=text_splitter.split_text(table_summaries)

In [19]:
#loading the embedding model
model_name="BAAI/bge-large-en"
model_kwargs={'device':'cuda'}

embeddings=HuggingFaceBgeEmbeddings(model_name=model_name,model_kwargs=model_kwargs)

print("Embedding model loaded")



Embedding model loaded


In [20]:
url="http://localhost:6333"
collection_name="test_collection"

qdrant=Qdrant.from_documents(
    text_chunks,
    embeddings,
    url=url,
    collection_name=collection_name,
    prefer_grpc=False
)

print("text index in qdrant")

#for tables

qdrant=Qdrant.from_texts(
    table_chunks,
    embeddings,
    url=url,
    collection_name=collection_name,
    prefer_grpc=False
)

print("tables index in qdrant")

tables index in qdrant


In [21]:
client=QdrantClient(url,prefer_grpc=False)
print("Qdrant client created")

Qdrant client created


In [22]:
db_documents=Qdrant(client=client,collection_name=collection_name,embeddings=embeddings)
print("Qdrant object created")

Qdrant object created


In [23]:
query="what is recommended fuel economy speed for 5th gear ?"

docs=db_documents.similarity_search_with_score(query=query,k=5)
print("query embedded and searched in qdrant")

query embedded and searched in qdrant


In [25]:
for doc in docs:
    print(doc)
    print("\n\n")

(Document(page_content='STARTING AND DRIVING\nRecommended Fuel Economy gears. • Always follow periodic & regular serv-\nSpeeds (MT) • Make sure that vehicle is completely ice schedule of the vehicle.\nstationary before you attempt to shift in • In places with high dust content (e.g.:\nDiesel Petrol\nGear reverse gear. Gurgaon, Jaisalmer etc.), Clean the air\nSpeed(kmph) Speed(kmph)\n• Drive slowly on wet roads. filter element at every 5000 km.\n1 20 20', metadata={'Author': 'SYSTEM', 'CreationDate': "D:20220930145512+05'30'", 'Creator': 'QuarkXPress(R) 14.37r2', 'ModDate': "D:20221007170919+05'30'", 'Producer': 'QuarkXPress(R) 14.37r2', 'Title': 'OM', 'XPressPrivate': '%%EndComments', 'file_path': 'pdfs/nexon-owner-manual-2022.pdf', 'page': 123, 'source': 'pdfs/nexon-owner-manual-2022.pdf', 'total_pages': 241, '_id': '297acae6-5194-40b2-9fa8-285f871baf7d', '_collection_name': 'test_collection'}), 0.8724079)



(Document(page_content='Speed(kmph) Speed(kmph)\n• Drive slowly on wet roads

In [26]:
retriever = qdrant.as_retriever()
print("Qdrant object converted to retriever")
print(retriever)

Qdrant object converted to retriever
tags=['Qdrant', 'HuggingFaceBgeEmbeddings'] vectorstore=<langchain_community.vectorstores.qdrant.Qdrant object at 0x7ff2443ce260>


In [27]:
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

In [56]:
#working prompts :-
#Answer the question based only on the following context, if the question has anything unrelated please ask a probing question and keep it short and dont tell anything else only the question, which can include text and tables
#


# Prompt template
template = """Answer the question based only on the following context, 
which can include text and tables,
if answer is present in tables, please provide in good format,
if the question has anything unrelated please ask a probing question,
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Option 1: LLM
model = ChatOllama(model="llama3")
# Option 2: Multi-modal LLM
# model = LLaVA

# RAG pipeline
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [61]:
chat = chain.invoke("In functions and settings of climate contro what is control knob position of blower speed for normal temperature ?")

In [62]:
print(chat)

Based on the provided context, I found the relevant information in the table:

```
Functions
Normal heating
2nd or 3rd dot
Desired temp.
Fresh air mode
As desired

or
Briefly switch ON to...
```

The control knob position for normal temperature and blower speed is not explicitly mentioned. However, we can infer that the "2nd or 3rd dot" refers to the control knob position.

To answer your question, I would like to clarify what you mean by "normal temperature". Are you referring to a specific temperature setting, such as 22°C (72°F)? If so, could you please provide more context or specify the exact temperature?
