In [3]:
# %pip install langchain langchain_community langchain_pinecone langchain_openai unstructured langchain-text-splitters pypdf python-dotenv

In [6]:
import glob
import os

from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from langchain_community.document_loaders import DirectoryLoader
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

# Pull variables from a local .env file into the process environment.
load_dotenv()

# Either key is None when the corresponding variable is not set.
OPENAI_API_KEY, PINECONE_API_KEY = (
    os.environ.get(var_name) for var_name in ("OPENAI_API_KEY", "PINECONE_API_KEY")
)

[Reference](https://python.langchain.com/v0.1/docs/use_cases/question_answering/) 

### **Indexing**
1. **Load**: First we need to load our data with `Document Loaders`
2. **Split**: Break the loaded documents into smaller chunks with `Text Splitters`
3. **Store**: Embed the chunks and store them with a `VectorStore` and an `Embeddings` model so they can be searched later

### **Retrieval and Generation** 
4. **Retrieve**: Given a user input, relevant splits are retrieved from storage using a `Retriever`
5. **Generate**: Generate an output using a ChatModel or LLM such as GPT-4o


In [4]:
# Step 1: LOAD
# One loader per input PDF; this walkthrough uses a single sample contract.
contract_loader = PyPDFLoader("data/sample-contract.pdf")
loaders = [contract_loader]

In [11]:
# Read the PDF handled by the first loader into a list of Document objects.
first_loader = loaders[0]
docs = first_loader.load()

In [12]:
# Step 2: SPLIT
# Chunk the loaded documents; the splitter runs with its library-default settings.
text_splitter = RecursiveCharacterTextSplitter()
split_docs = text_splitter.split_documents(docs)

# Shared by the store/retrieve steps further down: the name of the Pinecone
# index that receives the vectors and the embedding model used to build them.
index_name = "playground"
embeddings = OpenAIEmbeddings()

In [13]:
# Preview only the first few chunks; echoing the whole list floods the
# notebook output with the full text of the contract.
split_docs[:3]

[Document(metadata={'source': 'data/sample-contract.pdf', 'page': 0}, page_content='This draft agreement is subject to change/fine tuning before \nfinal award of the contract  \n \n(Sample Contract Agreement ) \n \nAGREEMENT FOR HALL OF RESIDENCE NO . – __ \n \n \nTHIS AGREEMENT  has been made on this __th day of October , 2012 at IIT Kanpur \nBETWEEN  Indian Institute of Technology Kanpur (hereinafter referred to as the Institute) \nincorporated as a body of corporate under the Institute of Technology A ct, 1961, through \nits Director (represented by Dean of Students’ Affairs, Chairman, Council of Wardens & \nWarden -in-Charge/Warden  of Hall of Residence No. -__.) \n \nAND M/s __________ registered under ________________________ and having it’s office \nat __________________________________________________  (hereinafter referred to as \nThe Service Provider which expression shall  include his/their respective heir s, executors, \nadministrators and assignees), represented by _______

In [15]:
# Step 3: STORE
vectorstore = PineconeVectorStore.from_documents(split_docs, embeddings, index_name=index_name) 

In [18]:
# Step 4: RETRIEVE
store = PineconeVectorStore.from_existing_index(embedding=embeddings, index_name=index_name)

In [23]:
# Question to answer from the contract.
query = "What are the contents of Article 3?"

# Show the stored chunks most similar to the question.
store.similarity_search(query=query)

[Document(metadata={'page': 6.0, 'source': 'data/sample-contract.pdf'}, page_content='7 \nARTICLE—3 \n \n3.0   Scope of Work:  \n  \nThe contra ct is essentially for providing following messing services to the \nresidents of  Hall of Residence No.-__. The Scope of Work, covered by the \ncontract, is broadly but not extensively described as given below:  \n \na) Cooking and serving three meals - breakfast, lunch and dinner. The \nhostel reserves the right to include the service for the preparation and serving of evening tea and light snacks in addition to these three meals. The terms and conditions of such an addition will be worked \nout as and when required; \nb) Facilitate procurement of raw material for and on behalf of the hostel; \nc) Managing and control of stocks and inventories; \nd) Coupon sales. Residents may use these coupons to get ‘extra’ items \nnot included in the basic menu of the mess; \ne) Cleaning of utensils, kitchen and serving items; \nf) Cleaning of cooking, dini

In [24]:
# Step 5: GENERATE
# Retrieval QA
# ChatOpenAI and RetrievalQA are imported in the import cell at the top of the
# notebook, so this cell only wires the pieces together.

# Chat model that writes the final answer.
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    api_key=OPENAI_API_KEY,
)

# "stuff" places all retrieved chunks into a single prompt together with the
# question.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=store.as_retriever(),
)

# Returns a dict holding the original 'query' and the generated 'result'.
qa.invoke(query)

{'query': 'What are the contents of Article 3?',
 'result': 'The contents of Article 3 include the Scope of Work for the messing services provided by the service provider to the residents of Hall of Residence No.-__. It outlines the tasks like cooking and serving three meals, managing stocks, cleaning, security of equipment, maintenance, and deployment of manpower. It also mentions the timing of meals, menu changes, provision of extra messing facilities, procurement of raw materials, and dispute resolution.'}