In [1]:
!wget -q https://www.dropbox.com/s/vs6ocyvpzzncvwh/new_articles.zip
!unzip -q new_articles.zip -d new_articles

**LangChain multi-doc retriever with ChromaDB**

Multiple Files

ChromaDB

Source info

gpt-3.5-turbo

#### Setting up LangChain

In [2]:
!pip install -q langchain openai chromadb tiktoken langchain-openai
# !pip install -q --upgrade langchain

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m815.9/815.9 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.7/226.7 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.0/509.0 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m241.2/241.2 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.9/75.9 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━

In [3]:
import os

os.environ["OPENAI_API_KEY"] = " "

In [4]:
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAI

#### Load multiple and process documents

In [5]:
# Load and process the text files
# loader = TextLoader("file_name.txt")

loader = DirectoryLoader("./new_articles/", glob = "./*.txt", loader_cls = TextLoader)

documents = loader.load()

In [6]:
# Spliting the text into

text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)

texts = text_splitter.split_documents(documents)
len(texts)

233

In [7]:
texts[3]

Document(page_content='Over the last half-decade, numerous Indian venture firms have shifted their attention to early-stage investments. Despite this increased focus, the market continues to depend on international investors to support mid- and growth-stage deals, highlighting the need for further growth in India’s venture capital ecosystem. “We have high performing mutual funds and PEs. We hope that more of these firms will launch dedicated funds for Indian startups,” he said.\n\nHalf of the capital in the new fund for 3one4 has come from Indian investors, another aspect that differentiates the firm from many of its peers. All the systemically important Indian banks, and the top five local banks by market cap overall have invested in the new fund. Eight of the top 10 mutual fund operators are also LPs in the new fund, said Pai. “We are also proud to have leading global endowments, sovereigns and insurance companies as LPs,” he said.', metadata={'source': 'new_articles/05-07-3one4-capi

#### Create Database

In [8]:
# Embed and store te texts
# Supplying a persist directory will store the embeddings on disk

persist_directory = "db"

# here we are using OpenAI embeddings but in future swap out to local embeddings
embedding = OpenAIEmbeddings()

vectordb = Chroma.from_documents(documents = texts,
                                 embedding = embedding,
                                 persist_directory = persist_directory)

In [9]:
# persiste the db to disk

vectordb.persist()
vectordb = True

In [10]:
# Now we can load the persisted databases from disk, and use it an normal
vectordb = Chroma(persist_directory = persist_directory,
                  embedding_function = embedding)

#### Make a retriever

In [11]:
retriever = vectordb.as_retriever()

In [12]:
docs = retriever.get_relevant_documents("How much money did Pando raise ?")

In [13]:
len(docs)

4

In [14]:
retriever = vectordb.as_retriever(search_kwargs = {"k" : 2})

In [15]:
retriever.search_type

'similarity'

In [16]:
retriever.search_kwargs

{'k': 2}

#### Make a chain

In [17]:
# create the chain to answer questions
qa_chain = RetrievalQA.from_chain_type(llm = OpenAI(),
                                       chain_type = "stuff",
                                       retriever = retriever,
                                       return_source_documents = True)

In [18]:
# cite sources

def process_llm_response(llm_response):
  print(llm_response['result'])
  print("\n\nSource:")
  for source in llm_response["source_documents"]:
    print(source.metadata['source'])

In [19]:
# full example
query = "How much did photo raise ?"
llm_response = qa_chain.invoke(query)
llm_response

{'query': 'How much did photo raise ?',
 'result': '\nI\'m sorry, I don\'t have enough information to answer that question. The context provided does not mention a company or product named "photo."',
 'source_documents': [Document(page_content='Etc.\n\nAmazon rolled out a Matter update for Alexa that includes support for Thread, setup on iOS, and a new version of its Works with Alexa program.\n\nand a new version of its Works with Alexa program. Match Group posted a Q1 earnings miss with revenue down by 1% YoY to $787 million and paying users down 3% to 15.9 million. The company, however, said it’s “very possible” the recent Apple-Epic court decision could result in App Store fee relief.\n\nMedtech startup Healthy.io, which provides urine analysis through a mobile app, is laying off a third of its staff, or around 70 people. The company had just raised $50 million in Series D funding.\n\nThe company had just raised $50 million in Series D funding. Airbnb announced Rooms, a feature that

In [20]:
process_llm_response(llm_response)


I'm sorry, I don't have enough information to answer that question. The context provided does not mention a company or product named "photo."


Source:
new_articles/05-06-this-week-in-apps-apple-and-google-team-up-on-trackers-google-i-o-preview-apps-hit-newfronts.txt
new_articles/05-06-this-week-in-apps-apple-and-google-team-up-on-trackers-google-i-o-preview-apps-hit-newfronts.txt


In [21]:
qa_chain.retriever.search_type, qa_chain.retriever.vectorstore

('similarity',
 <langchain_community.vectorstores.chroma.Chroma at 0x7a0483d01fc0>)

#### Chat prompts

In [22]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:


In [23]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:


#### Deleting the database

In [24]:
!zip -r db.zip ./db

  adding: db/ (stored 0%)
  adding: db/chroma.sqlite3 (deflated 41%)
  adding: db/67f3c5bf-2c2a-4d51-9bff-89cdd53ab650/ (stored 0%)
  adding: db/67f3c5bf-2c2a-4d51-9bff-89cdd53ab650/length.bin (deflated 12%)
  adding: db/67f3c5bf-2c2a-4d51-9bff-89cdd53ab650/data_level0.bin (deflated 100%)
  adding: db/67f3c5bf-2c2a-4d51-9bff-89cdd53ab650/link_lists.bin (stored 0%)
  adding: db/67f3c5bf-2c2a-4d51-9bff-89cdd53ab650/header.bin (deflated 61%)


In [25]:
# To clean up, you can delete the collection
vectordb.delete_collection()
vectordb.persist()

# delete the directory
!rm -rf db/

#### Starting again loading the db

restart the runtime

In [26]:
!unzip /content/db.zip

Archive:  /content/db.zip
   creating: db/
  inflating: db/chroma.sqlite3       
   creating: db/67f3c5bf-2c2a-4d51-9bff-89cdd53ab650/
  inflating: db/67f3c5bf-2c2a-4d51-9bff-89cdd53ab650/length.bin  
  inflating: db/67f3c5bf-2c2a-4d51-9bff-89cdd53ab650/data_level0.bin  
 extracting: db/67f3c5bf-2c2a-4d51-9bff-89cdd53ab650/link_lists.bin  
  inflating: db/67f3c5bf-2c2a-4d51-9bff-89cdd53ab650/header.bin  


In [27]:
persist_directory = "/content/db"
embedding = OpenAIEmbeddings()

vectordb2 = Chroma(persist_directory = persist_directory,
                   embedding_function = embedding)

retriever = vectordb2.as_retriever(search_kwargs = {"k" : 2})

In [28]:
# set up turbo llm

turbo_llm = ChatOpenAI(
    temperature = 0.0,
    model_name = "gpt-3.5-turbo"
)

In [29]:
# create the chain to answer questions
# from langchain.llms import OpenAI
qa_chain = RetrievalQA.from_chain_type(llm = OpenAI(),
                                       chain_type = "stuff",
                                       retriever = retriever,
                                       return_source_documents = True)

In [30]:
# cite sources
def process_llm_response(llm_response):
  print(llm_response['result'])
  print("\n\nSource:")
  for source in llm_response["source_documents"]:
    print(source.metadata['source'])

In [31]:
# full example
query = "How much did Pando raise ?"
llm_response = qa_chain.invoke(query)
llm_response

{'query': 'How much did Pando raise ?',
 'result': ' Pando raised $30 million in the Series B round, bringing its total raised to $45 million.',
 'source_documents': [Document(page_content='Signaling that investments in the supply chain sector remain robust, Pando, a startup developing fulfillment management technologies, today announced that it raised $30 million in a Series B round, bringing its total raised to $45 million.\n\nIron Pillar and Uncorrelated Ventures led the round, with participation from existing investors Nexus Venture Partners, Chiratae Ventures and Next47. CEO and founder Nitin Jayakrishnan says that the new capital will be put toward expanding Pando’s global sales, marketing and delivery capabilities.\n\n“We will not expand into new industries or adjacent product areas,” he told TechCrunch in an email interview. “Great talent is the foundation of the business — we will continue to augment our teams at all levels of the organization. Pando is also open to exploring 

In [32]:
process_llm_response(llm_response)

 Pando raised $30 million in the Series B round, bringing its total raised to $45 million.


Source:
new_articles/05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt
new_articles/05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt


                                -: END :-