In [102]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI

from langchain.vectorstores import Chroma

In [103]:
pdf_loader=PyPDFDirectoryLoader("pdfs")

In [104]:
data=pdf_loader.load()

In [105]:
data[0].page_content.strip()

'Machinery Maintenance Report\nJCB\nHours of Operation: 50 - Lubricated all grease points, inspected and cleaned air filter, checked battery terminals - All in good condition.\nHours of Operation: 250 - Changed engine oil and filter, replaced fuel filter, cleaned radiator and cooler - No issues found.\nHours of Operation: 500 - Changed hydraulic oil filter, inspected hydraulic system - Minor wear on hoses.\nBulldozer\nHours of Operation: 100 - Checked and adjusted track tension, lubricated all points - Track tension needed adjustment.\nHours of Operation: 300 - Replaced engine oil and filter, cleaned air filter - Air filter had significant dust.\nHours of Operation: 600 - Checked hydraulic system, inspected undercarriage - No significant wear observed.\nExcavator\nHours of Operation: 200 - Lubricated all grease points, checked hydraulic oil level - All in good condition.\nHours of Operation: 500 - Changed engine oil and filter, inspected hydraulic system - Hydraulic oil level low, topp

In [106]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=100
)

In [107]:
text_chunks=text_splitter.split_documents(data)

In [108]:
text_chunks[0]

Document(metadata={'source': 'pdfs/machinery_maintenance_report.pdf', 'page': 0}, page_content='Machinery Maintenance Report\nJCB\nHours of Operation: 50 - Lubricated all grease points, inspected and cleaned air filter, checked battery terminals - All in good condition.\nHours of Operation: 250 - Changed engine oil and filter, replaced fuel filter, cleaned radiator and cooler - No issues found.')

In [109]:
import os
from dotenv import load_dotenv

load_dotenv()
google_gemini_api=os.getenv("GOOGLE_API_KEY")

### Creating DB

In [110]:
persist_directory="db"

In [111]:
embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

I0000 00:00:1722068459.907874  679189 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


In [112]:
vectordb=Chroma.from_documents(
    documents=text_chunks,
    embedding=embedding,
    persist_directory=persist_directory
)

In [113]:
vectordb.persist()

In [114]:
vectordb=None

In [115]:
vectordb=Chroma(persist_directory=persist_directory,embedding_function=embedding)

In [116]:
vectordb

<langchain_community.vectorstores.chroma.Chroma at 0x1408e10a0>

### getting the DB

In [117]:
retriver=vectordb.as_retriever()

In [118]:
docs=retriver.get_relevant_documents("Who Is anurag singh and where from he?")

In [119]:
docs

[Document(metadata={'page': 0, 'source': 'pdfs/machinery_maintenance_report.pdf'}, page_content='Machinery Maintenance Report\nJCB\nHours of Operation: 50 - Lubricated all grease points, inspected and cleaned air filter, checked battery terminals - All in good condition.\nHours of Operation: 250 - Changed engine oil and filter, replaced fuel filter, cleaned radiator and cooler - No issues found.'),
 Document(metadata={'page': 0, 'source': 'pdfs/machinery_maintenance_report.pdf'}, page_content='Machinery Maintenance Report\nJCB\nHours of Operation: 50 - Lubricated all grease points, inspected and cleaned air filter, checked battery terminals - All in good condition.\nHours of Operation: 250 - Changed engine oil and filter, replaced fuel filter, cleaned radiator and cooler - No issues found.'),
 Document(metadata={'page': 0, 'source': 'pdfs/machinery_maintenance_report.pdf'}, page_content='Machinery Maintenance Report\nJCB\nHours of Operation: 50 - Lubricated all grease points, inspected

In [120]:
import os
from dotenv import load_dotenv

load_dotenv()
google_gemini_api=os.getenv("GOOGLE_API_KEY")

In [121]:
llm_model=ChatGoogleGenerativeAI(model="gemini-1.5-pro",google_api_key=google_gemini_api)

I0000 00:00:1722068462.676466  679189 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1722068462.677410  679189 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


In [122]:
system_prompt = (
    "You have an expertise on Muncipal corporation and you are well aware about the Muncipal Corporation Indore and You have all the information regarding the Indore"
    "You also have some additional data from the dataset of the IMC indore"
    "Provide the answer consisely"
    "Provide the answer Under 150 words"
    "Context: {context}"
)

In [123]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)
question_answer_chain = create_stuff_documents_chain(llm_model, prompt)
chain = create_retrieval_chain(retriver, question_answer_chain)

In [124]:
chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['Chroma', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x1408e10a0>), config={'run_name': 'retrieve_documents'})
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), config={'run_name': 'format_inputs'})
            | ChatPromptTemplate(input_variables=['context', 'input'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template='You have an expertise on Muncipal corporation and you are well aware about the Muncipal Corporation Indore and You have all the information regarding the IndoreYou also have some additional data from the dataset of the IMC indoreProvide the answer consiselyProvide the answer Under 150 wor

In [125]:
query="i got a pithole on indore near highway what i can do ?"

In [126]:
chain.invoke({"input": query})

{'input': 'i got a pithole on indore near highway what i can do ?',
 'context': [Document(metadata={'page': 1, 'source': 'pdfs/machinery_maintenance_report.pdf'}, page_content='Machinery Maintenance Report\nHours of Operation: 1200 - Checked waste compactor, inspected tires - Tires needed rotation.\nPage 2'),
  Document(metadata={'page': 1, 'source': 'pdfs/machinery_maintenance_report.pdf'}, page_content='Machinery Maintenance Report\nHours of Operation: 1200 - Checked waste compactor, inspected tires - Tires needed rotation.\nPage 2'),
  Document(metadata={'page': 1, 'source': 'pdfs/machinery_maintenance_report.pdf'}, page_content='Machinery Maintenance Report\nHours of Operation: 1200 - Checked waste compactor, inspected tires - Tires needed rotation.\nPage 2'),
  Document(metadata={'page': 0, 'source': 'pdfs/machinery_maintenance_report.pdf'}, page_content='Hours of Operation: 1000 - Replaced hydraulic oil and filter, inspected tracks - Minor wear on tracks.\nRoad Roller\nHours of O