In [1]:
# --- Environment setup -------------------------------------------------------
import os
import sys

import openai

# Extend the module search path with the directory two levels above the
# notebook's working directory.
sys.path.append('../..')

from dotenv import find_dotenv, load_dotenv

# Load variables from a local .env file (if one is found) into os.environ.
# The result is bound to `_` so it is not echoed.
_ = load_dotenv(find_dotenv())

# Raises KeyError right away when the key is not configured.
openai.api_key = os.environ['OPENAI_API_KEY']

In [2]:
import datetime

# Before 2 September 2023 use the dated "0301" snapshot; from that day on use
# the unversioned model name.
MODEL_SWITCH_DATE = datetime.date(2023, 9, 2)

current_date = datetime.date.today()
llm_name = "gpt-3.5-turbo-0301" if current_date < MODEL_SWITCH_DATE else "gpt-3.5-turbo"
print(llm_name)

gpt-3.5-turbo-0301


# Document Loading

In [3]:
from langchain.document_loaders import PyPDFLoader

# Location of the lease PDF to index. Set the LEASE_PDF_PATH environment
# variable (e.g. in the .env file loaded in the first cell) to run the notebook
# on another machine or another document; without it, the original author's
# local path is used so existing behaviour is unchanged.
pdf_path = os.environ.get(
    "LEASE_PDF_PATH",
    "/Users/aarjavsanghvi/Downloads/new_lease_subdoc.pdf",
)

# PyPDFLoader yields one Document per PDF page; each carries the extracted text
# in `page_content` and the source path / page index in `metadata`.
loader = PyPDFLoader(pdf_path)
pages = loader.load()

In [4]:
# Number of Document objects the loader returned.
len(pages)

2

In [5]:
# Pick the first loaded page for a quick manual inspection.
page = pages[0]

In [6]:
# Preview the first 500 characters of the text extracted from that page.
print(page.page_content[0:500])

Page 1 of 13
    
YOU ARE LEGALLY BOUND  BY THIS DOCUMENT AND  ANY ADDENDA ATTACHED 
HERETO. PLEASE READ IT CAREFULLY  BEFORE SIGNING THIS LEASE AGREEMENT. 
ADDITIONAL PROVISIONS  OR CHANGES MAY BE MADE IN THE  LEASE.
Community: 777 S. State  St.                   [X] New Lease
Ownership:      [] Renewal
American Heritage Investment, LLC
2 E. 8th St,  Chicago, IL 60605
The Property Manager for the Property is Group Fox Property Management
  Unit Address/Apartment #:       2  East 8th  St    Apt:


In [7]:
# Loader-supplied metadata: the source file path and the zero-based page index.
page.metadata

{'source': '/Users/aarjavsanghvi/Downloads/new_lease_subdoc.pdf', 'page': 0}

# Document Splitting

In [8]:
from langchain.text_splitter import CharacterTextSplitter

# Splitter settings: break on newlines, size chunks with `len` (i.e. in
# characters) at 1000, and overlap adjacent chunks by 150.
splitter_options = {
    "separator": "\n",
    "chunk_size": 1000,
    "chunk_overlap": 150,
    "length_function": len,
}
text_splitter = CharacterTextSplitter(**splitter_options)

In [9]:
# Break every loaded page into overlapping chunks using the splitter configured above.
splits = text_splitter.split_documents(pages)

In [10]:
# Sanity check on the container type returned by the splitter (a plain list).
print(type(splits))

<class 'list'>


# Document Embedding

In [11]:
from langchain.embeddings.openai import OpenAIEmbeddings
# Embedding client created with library defaults. The rate-limit log recorded
# later in this notebook shows its requests going to text-embedding-ada-002.
embedding = OpenAIEmbeddings()

In [12]:
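# Disabled scratch cell, kept for reference only. Do not re-enable as written:
# `docs` is not defined until the Similarity Search section, and rebinding
# `embedding` to the vector returned by `embed_query` would break the
# `Chroma.from_documents(..., embedding=embedding)` call below.
# embedding = embedding.embed_query(str(docs))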

# Document Vectorization

In [13]:
from langchain.vectorstores import Chroma

In [14]:
# Persist the Chroma store under ./docs/chroma -- the same directory that the
# cleanup cell (`rm -rf ./docs/chroma`) wipes -- so every run starts from an
# empty store instead of appending duplicate chunks to a stale one. The
# previous value pointed at a user's Documents folder, which the cleanup never
# touched. A relative path also keeps the notebook portable across machines.
persist_directory = 'docs/chroma/'

In [15]:
!rm -rf ./docs/chroma  # remove old database files if any

In [16]:
# Embed every chunk in `splits` with `embedding` and store the vectors in a
# Chroma collection backed by `persist_directory`.
# NOTE(review): if that directory already holds a store from an earlier run,
# the chunks are presumably added on top of it -- confirm the directory is clean.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [17]:
# Number of entries currently in the collection. This reads Chroma's private
# `_collection` handle, so it may break across library versions.
print(vectordb._collection.count())

44


# Similarity Search

In [18]:
# Query used to exercise the vector store directly, before any LLM is involved.
question = "what is the address and unit number"

In [19]:
# Fetch the 3 stored chunks most similar to the question.
docs = vectordb.similarity_search(question,k=3)

In [20]:
# Write the vector store to `persist_directory` so it can be reloaded in a later session.
vectordb.persist() # saving the vectordb to be used later

# Building the chat model

In [21]:
from langchain.chat_models import ChatOpenAI
# Chat model selected by `llm_name` above; temperature=0 keeps sampling
# randomness to a minimum so answers are as repeatable as possible.
llm = ChatOpenAI(model_name=llm_name, temperature=0)

In [22]:
# Build the prompt for a single-turn, retrieval-augmented QA chain. The template
# tells the model to answer only from the retrieved context, to admit when it
# does not know, to use at most three sentences, and to end with
# "thanks for asking!". `{context}` and `{question}` are filled in by the chain.
from langchain.prompts import PromptTemplate
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"],template=template,)

# Construct (but do not run) a RetrievalQA chain that uses the prompt above and
# also returns the source documents it retrieved.
# NOTE(review): none of the later cells shown in this notebook call `qa_chain`;
# the questions are answered by the conversational chain `qa`, which is created
# without QA_CHAIN_PROMPT. Confirm whether this chain is still needed.
from langchain.chains import RetrievalQA
qa_chain = RetrievalQA.from_chain_type(llm,
                                       retriever=vectordb.as_retriever(),
                                       return_source_documents=True,
                                       chain_type_kwargs={"prompt": QA_CHAIN_PROMPT})

In [23]:
# Conversation memory: keeps the running dialogue and exposes it to the chain
# under the "chat_history" key, as message objects rather than a single string.
from langchain.memory import ConversationBufferMemory

memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

In [24]:
from langchain.chains import ConversationalRetrievalChain

# Expose the vector store through the retriever interface the chain expects.
retriever = vectordb.as_retriever()

# Conversational QA chain: combines the chat model, the retriever and the
# conversation memory so follow-up questions can build on earlier turns.
qa = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

In [25]:
# First turn: send the question through the conversational chain. The returned
# dict holds the model's reply under the 'answer' key.
question = "What is the address and unit number?"
result = qa({"question": question})

In [26]:
# Display the reply to the first question.
result['answer']

'The address is 2 East 8th St, Chicago, IL 60605 and the unit number is 2310.'

In [27]:
# Second turn on the same chain, with `memory` attached. The rate-limit retries
# logged below were raised by the embedding call (`embed_with_retry`).
question = "How long is the lease?"
result = qa({"question": question})

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-KlrlXQmswpa3HNN81NgyKFio on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-KlrlXQmswpa3HNN81NgyKFio on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/

In [28]:
# Display the reply to the follow-up question.
result['answer']

'The lease start date is 07/31/2023 and the lease end date is 07/30/2024, so the length of the lease is one year.'