In [23]:
#read pdf file 
import PyPDF2

# Open the HR policy manual and gather the extracted text of every page,
# one list entry per page, in page order.
pdf_obj = PyPDF2.PdfReader('./data/HR Policy Manual.pdf')
data = [pdf_page.extract_text() for pdf_page in pdf_obj.pages]

# print('\n'.join(data))

In [39]:
from langchain.docstore.document import Document
# Wrap the whole manual (page texts joined by newlines) in a single Document
# and display it as the cell result.
docs = Document(page_content="\n".join(data))
docs



# chunking 

In [21]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [40]:
# method-1: split the joined page text into string chunks, then turn the
# chunks into Document objects.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)
split_text = text_splitter.split_text("\n".join(data))
# NOTE(review): create_documents appears to split each input text itself, so
# the split_text pass above may be redundant -- confirm against the
# langchain text-splitter documentation.
texts = text_splitter.create_documents(split_text)
len(texts)

7

In [41]:
# method-2: let the splitter chunk the Document built earlier (`docs`)
# directly; this rebinds `texts` from method-1.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)
texts = text_splitter.split_documents([docs])
len(texts)

7

## embedding instance

In [72]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from dotenv import load_dotenv

# Pull settings from a local .env file into the process environment.
load_dotenv()

# Sentence-transformer embedding wrapper; the final expression displays the
# size of the vectors it produces.
embdd = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-V2')
embdd.client.get_sentence_embedding_dimension()

384

## vector store pinecone

In [88]:
from langchain.vectorstores import Pinecone
from pinecone import Pinecone as pc, PodSpec

# Load variables from a .env file into the environment. The Pinecone client
# below is constructed with no arguments, so presumably it reads its API key
# from there -- confirm.
load_dotenv()
def store_embeddings(chunk_text,embeddings,index_name='pdf-store'):
  """Embed document chunks and upload them to a Pinecone index.

  WARNING: destructive. Every index in the account whose name differs from
  ``index_name`` is deleted before the upload (presumably because the
  'gcp-starter' environment only allows a single index -- confirm before
  running against a shared account).

  NOTE(review): when ``index_name`` already exists it is reused as-is, so
  re-running this function likely uploads the same chunks again -- confirm.

  Args:
    chunk_text: documents handed to ``Pinecone.from_documents``.
    embeddings: embedding wrapper; must expose
      ``client.get_sentence_embedding_dimension()``, which supplies the
      dimension when the index has to be created.
    index_name: name of the target index; it is created when missing.
  """
  pc_config=pc()

  for name in pc_config.list_indexes().names():
    if name==index_name:
      continue
    try:
      pc_config.delete_index(name)
      # report the index that was actually removed, not the target index
      print(f'delete index {name}')
    except Exception as e:
      # best-effort clean-up: keep going, but surface the real failure
      print(f'could not delete index {name}: {e}')

  # Check for the target index itself rather than for an empty account, so a
  # surviving unrelated index cannot cause the creation step to be skipped.
  if index_name not in pc_config.list_indexes().names():
    print('creating new index')
    pc_config.create_index(name=index_name,
                    dimension=embeddings.client.get_sentence_embedding_dimension(),
                    metric='dotproduct',
                    spec=PodSpec(environment='gcp-starter'))

  Pinecone.from_documents(chunk_text,embeddings,index_name=index_name)

In [89]:
# Upload the chunked documents using the embedding model defined earlier.
# Careful: store_embeddings deletes every Pinecone index not named 'pdf-store'.
store_embeddings(texts,embdd)

# retrieve

In [90]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Pinecone 

def get_embedding(model_name='all-MiniLM-L6-V2'):
  """Build the sentence-transformer embedding wrapper used for retrieval.

  Args:
    model_name: model passed to ``SentenceTransformerEmbeddings``. The
      default is the model this notebook uses when creating ``embdd`` for
      the upload step; NOTE(review): queries presumably need the same model
      as the stored vectors -- confirm before overriding.

  Returns:
    A ``SentenceTransformerEmbeddings`` instance for ``model_name``.
  """
  return SentenceTransformerEmbeddings(model_name=model_name)


def get_index(embedding,index_name='pdf-store'):
  """Open an existing Pinecone index as a LangChain vector store.

  Args:
    embedding: embedding wrapper passed to ``Pinecone.from_existing_index``.
    index_name: name of the index to open; defaults to 'pdf-store', the
      name ``store_embeddings`` uploads to.

  Returns:
    Whatever ``Pinecone.from_existing_index`` returns for that index.
  """
  return Pinecone.from_existing_index(index_name,embedding)

def get_relevant_docs(index,query,k=2):
  """Return the result of a similarity search for ``query`` on ``index``.

  Args:
    index: object exposing ``similarity_search(query, k)``.
    query: text to search for.
    k: forwarded as the second positional argument of the search (default 2).
  """
  matches=index.similarity_search(query,k)
  return matches

In [100]:
# Embed the question, open the stored index and fetch the closest chunks.
query = 'how many hours does driver trained?'
embedding = get_embedding()
index = get_index(embedding)
# Note: this rebinds `docs`, which earlier held the single Document built from
# the PDF text; the method-2 chunking cell reads that name, so re-running it
# after this cell would receive the retrieval result instead.
docs = get_relevant_docs(index, query)

In [111]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
# Answer the question over the retrieved chunks with a 'refine' QA chain;
# verbose=True makes the chain print the prompts it sends.
# NOTE(review): OpenAI() is built with no arguments, so the API key is
# presumably taken from the environment -- confirm.
chain = load_qa_chain(
    OpenAI(),
    chain_type='refine',
    verbose=True,
)
chain.run(input_documents=docs, question=query)



[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mContext information is below. 
------------
drivers completed over 2,000 hours of driving training to enhance their skills and knowledge.  
 
Route Planning and Optimization  
Efficient route planning is essential for timely transportation services. Our department utilizes 
advanced routing software to optimize routes and minimize travel time. In the past year, we reduced 
our average route duration  by 15% through effective route planning and optimization strategies.  
 
Customer Service  
We prioritize exceptional customer service. Our drivers are trained to provide a friendly and 
respectful experience to all passengers. In the past year, we received an average customer 
satisfaction rating of 4.5 out of 5, demonstrating our commitment to meeting customer needs and 
exceeding their expectations.  
 
Incident Reporting and Investigation  
Accidents o

'\n\nIt is stated that drivers completed over 2,100 hours of driving training in the past year, with a focus on defensive driving, customer service, and emergency preparedness. Additionally, they completed ongoing professional development training and participated in 20 compliance audits to ensure adherence to regulations.'