# LangChain: Q&A over Documents

An example might be a tool that would allow you to query a product catalog for items of interest.

In [1]:
#pip install --upgrade langchain

In [2]:
import os

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

Note: LLMs do not always produce the same results. When executing the code in your notebook, you may get slightly different answers than those in the video.

In [3]:
# Choose the chat model by calendar date: the pinned "-0301" snapshot is used
# up to and including the cutoff day, the unversioned alias after it.
import datetime

# Cutoff day; strictly later dates switch to "gpt-3.5-turbo".
target_date = datetime.date(2024, 6, 12)

# Today's date in local time.
current_date = datetime.date.today()

llm_model = "gpt-3.5-turbo" if current_date > target_date else "gpt-3.5-turbo-0301"

In [4]:
from IPython.display import display, Markdown
from langchain.chains import RetrievalQA
from langchain.document_loaders import CSVLoader
from langchain.vectorstores import DocArrayInMemorySearch
from langchain_openai import ChatOpenAI, OpenAI, OpenAIEmbeddings

In [5]:
# Relative path to the sample e-commerce CSV, and a CSV loader bound to it.
file = 'datasets/sample-ecomm-dataset.csv'
loader = CSVLoader(file_path=file)

In [6]:
from langchain.indexes import VectorstoreIndexCreator

In [7]:
#pip install docarray

In [8]:
# Build the in-memory vector index over the CSV rows in a single step.
# The embedding model is passed explicitly: relying on the creator's implicit
# default is what emits the deprecation warning, and newer LangChain releases
# are expected to require this argument (confirm against the installed version).
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
    embedding=OpenAIEmbeddings(),
).from_loaders([loader])

  warn_deprecated(


In [9]:
# Aggregate question for the one-step index. The two adjacent literals are
# joined by the parser into one single-line string.
query = (
    "Please list the top 3 categories by number of orders "
    "in a table in markdown."
)

**Note**:
- The notebook uses `langchain==0.0.179` and `openai==0.27.7`
- For these library versions, `VectorstoreIndexCreator` uses `text-davinci-003` as the base model, which has been deprecated since 1 January 2024.
- The replacement model, `gpt-3.5-turbo-instruct`, will be used instead for the `query`.
- The `response` format might differ from the one shown in the video because of this replacement model.

In [10]:
# Completion model standing in for the retired default of the index creator;
# the answer length is capped at 512 tokens.
llm_replacement_model = OpenAI(
    model='gpt-3.5-turbo-instruct',
    temperature=0,
    max_tokens=512,
)

# Ask the question against the index, answering with the replacement model.
response = index.query(query, llm=llm_replacement_model)

In [11]:
# Render the answer string as Markdown so the table shows up formatted.
display(Markdown(response))



| Category Name | Number of Orders |
|---------------|------------------|
| Mobiles & Tablets | 3 |
| Books | 1 |

## Step By Step

In [12]:
# Step-by-step walkthrough starts here: (re-)import the CSV loader and bind a
# fresh loader to the same `file` path defined earlier in the notebook.
from langchain.document_loaders import CSVLoader
loader = CSVLoader(file_path=file)

In [13]:
# Read the CSV into a list of Document objects (the sample shown below carries
# the row's fields as text plus `source` and `row` metadata).
docs = loader.load()

In [14]:
# Peek at the first loaded document.
docs[0]

Document(page_content='\ufeffitem_id: 379040\nstatus: complete\ncreated_at: 1/1/2017\nsku: Rajesh_Black Glue Gun\nprice: 825\nqty_ordered: 1\ngrand_total: 1185\nincrement_id: 100255234\ncategory_name_1: Home & Living\nsales_commission_code: \\N\ndiscount_amount: 0\npayment_method: cod\nWorking Date: 1/1/2017\nBI Status: Net\nMV: 825\nYear: 2017\nMonth: 1\nCustomer Since: Jan-17\nM-Y: 1-2017\nFY: FY17\nCustomer ID: 30115', metadata={'source': 'datasets/sample-ecomm-dataset.csv', 'row': 0})

In [15]:
# Embedding model used below for both the sample sentence and the documents.
# Imported from langchain_openai (the package this notebook already uses for
# ChatOpenAI/OpenAI) rather than the deprecated langchain.embeddings path.
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

In [16]:
# Embed one sample sentence to see what an embedding vector looks like.
embed = embeddings.embed_query("Hi my name is Harrison")

In [17]:
# Number of components in the embedding vector.
print(len(embed))

1536


In [18]:
# Show only the first five components instead of dumping the whole vector.
print(embed[:5])

[-0.0219938028518557, 0.006747527976699378, -0.018252847709138532, -0.03916704653175717, -0.013997197145759583]


In [19]:
# Embed every loaded document and keep the vectors in an in-memory store.
db = DocArrayInMemorySearch.from_documents(docs, embeddings)

In [20]:
# Natural-language question used for the similarity search that follows.
query = "Please suggest a sku from Men's Fashion"

In [21]:
# Retrieve the stored documents most similar to the query.
# NOTE(review): this rebinds `docs` from the full CSV load to the search hits,
# so re-running the cell that builds `db` afterwards would index only these hits.
docs = db.similarity_search(query)

In [22]:
# How many documents the similarity search returned.
len(docs)

4

In [23]:
# Inspect the first search hit.
docs[0]

Document(page_content="\ufeffitem_id: 379172\nstatus: complete\ncreated_at: 1/1/2017\nsku: motif_MPT-205-Mustard-33\nprice: 549.5\nqty_ordered: 1\ngrand_total: 549.5\nincrement_id: 100255322\ncategory_name_1: Men's Fashion\nsales_commission_code: \\N\ndiscount_amount: 0\npayment_method: cod\nWorking Date: 1/1/2017\nBI Status: Net\nMV: 550\nYear: 2017\nMonth: 1\nCustomer Since: Aug-16\nM-Y: 1-2017\nFY: FY17\nCustomer ID: 4914", metadata={'source': 'datasets/sample-ecomm-dataset.csv', 'row': 99})

In [24]:
# Expose the vector store through the retriever interface that the QA chain takes.
retriever = db.as_retriever()

In [25]:
# Chat model for the step-by-step section; the model name comes from the
# date-based `llm_model` selection made earlier in the notebook.
llm = ChatOpenAI(model=llm_model, temperature=0.0)

In [26]:
# Concatenate the text of every retrieved document, in order, with no
# separator between them, to form the context part of a hand-built prompt.
qdocs = "".join(doc.page_content for doc in docs)


In [27]:
# Hand-built prompt: retrieved text first, then the question. The adjacent
# literals are joined into one string with no line break.
response = llm.invoke(
    f"{qdocs} Question: Please suggest all the "
    "skus from Men's Fashion category in a markdown table."
)


In [28]:
# The chat model's reply text is read from `.content`; render it as Markdown.
display(Markdown(response.content))

| SKU | Price |
| --- | --- |
| motif_MPT-205-Mustard-33 | 549.5 |
| adolph_WD-9-XL | 749 |
| Relevant_PO-2-M | 700 |

In [29]:
# Retrieval QA chain wiring the chat model to the vector-store retriever.
# verbose=True prints the "Entering new ... chain" / "Finished chain" trace.
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    verbose=True,
)

In [30]:
# Question to put to the retrieval QA chain.
query = "Please list all skus from Men's Fashion category in a markdown table."

In [31]:
# Run the retrieval QA chain on the question.
response = qa_stuff.invoke(query)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [32]:
# The answer text is read from the 'result' key of the chain's response.
display(Markdown(response['result']))

| SKU | Price |
| --- | ----- |
| motif_MPT-205-Mustard-33 | 549.5 |
| adolph_WD-9-XL | 749 |
| aybeez_ABZ-2103-M | 299 |

In [33]:
# Same question through the one-step index, this time answered by the chat model.
# NOTE: the answer is stored in `response` but is not displayed by this cell.
response = index.query(query, llm=llm)

In [34]:
# Rebuild the one-step index, this time handing it the `embeddings` object
# created earlier instead of leaving the choice to the creator's default.
index = VectorstoreIndexCreator(
    embedding=embeddings,
    vectorstore_cls=DocArrayInMemorySearch,
).from_loaders([loader])