In [8]:
!wget -q https://www.dropbox.com/s/vs6ocyvpzzncvwh/new_articles.zip
!unzip -q new_articles.zip -d new_articles

**LangChain multi-doc retriever with ChromaDB**

Multiple Files

ChromaDB

Source info

gpt-3.5-turbo

#### Setting up LangChain

In [21]:
!pip install -q langchain openai chromadb tiktoken
# !pip install -q --upgrade langchain

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.8 MB[0m [31m9.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m34.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llmx 0.0.15a0 requires cohere, which is not installed.[0m[31m
[0m

In [1]:
import os

os.environ["OPENAI_API_KEY"] = " "

In [11]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader, DirectoryLoader

#### Load multiple and process documents

In [12]:
# Load and process the text files
# loader = TextLoader("file_name.txt")

loader = DirectoryLoader("./new_articles/", glob = "./*.txt", loader_cls = TextLoader)

documents = loader.load()

In [13]:
# Spliting the text into

text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)

texts = text_splitter.split_documents(documents)
len(texts)

233

In [14]:
texts[3]

Document(page_content='Over the last half-decade, numerous Indian venture firms have shifted their attention to early-stage investments. Despite this increased focus, the market continues to depend on international investors to support mid- and growth-stage deals, highlighting the need for further growth in India’s venture capital ecosystem. “We have high performing mutual funds and PEs. We hope that more of these firms will launch dedicated funds for Indian startups,” he said.\n\nHalf of the capital in the new fund for 3one4 has come from Indian investors, another aspect that differentiates the firm from many of its peers. All the systemically important Indian banks, and the top five local banks by market cap overall have invested in the new fund. Eight of the top 10 mutual fund operators are also LPs in the new fund, said Pai. “We are also proud to have leading global endowments, sovereigns and insurance companies as LPs,” he said.', metadata={'source': 'new_articles/05-07-3one4-capi

#### Create Database

In [22]:
# Embed and store te texts
# Supplying a persist directory will store the embeddings on disk

persist_directory = "db"

# here we are using OpenAI embeddings but in future swap out to local embeddings
embedding = OpenAIEmbeddings()

vectordb = Chroma.from_documents(documents = texts,
                                 embedding = embedding,
                                 persist_directory = persist_directory)

In [23]:
# persiste the db to disk

vectordb.persist()
vectordb = True

In [24]:
# Now we can load the persisted databases from disk, and use it an normal
vectordb = Chroma(persist_directory = persist_directory,
                  embedding_function = embedding)

#### Make a retriever

In [25]:
retriever = vectordb.as_retriever()

In [26]:
docs = retriever.get_relevant_documents("How much money did Pando raise ?")

In [27]:
len(docs)

4

In [28]:
retriever = vectordb.as_retriever(search_kwargs = {"k" : 2})

In [29]:
retriever.search_type

'similarity'

In [30]:
retriever.search_kwargs

{'k': 2}

#### Make a chain

In [32]:
# create the chain to answer questions
qa_chain = RetrievalQA.from_chain_type(llm = OpenAI(),
                                       chain_type = "stuff",
                                       retriever = retriever,
                                       return_source_documents = True)

In [33]:
# cite sources

def process_llm_response(llm_response):
  print(llm_response['result'])
  print("\n\nSource:")
  for source in llm_response["source_documents"]:
    print(source.metadata['source'])

In [34]:
# full example
query = "How much did photo raise ?"
llm_response = qa_chain(query)
llm_response

  warn_deprecated(


{'query': 'How much did photo raise ?',
 'result': " I don't know.",
 'source_documents': [Document(page_content='Etc.\n\nAmazon rolled out a Matter update for Alexa that includes support for Thread, setup on iOS, and a new version of its Works with Alexa program.\n\nand a new version of its Works with Alexa program. Match Group posted a Q1 earnings miss with revenue down by 1% YoY to $787 million and paying users down 3% to 15.9 million. The company, however, said it’s “very possible” the recent Apple-Epic court decision could result in App Store fee relief.\n\nMedtech startup Healthy.io, which provides urine analysis through a mobile app, is laying off a third of its staff, or around 70 people. The company had just raised $50 million in Series D funding.\n\nThe company had just raised $50 million in Series D funding. Airbnb announced Rooms, a feature that focuses on the ability to book single rooms averaging $67 per night as users complain about excessive fees, onerous checkout proce

In [35]:
process_llm_response(llm_response)

 I don't know.


Source:
new_articles/05-06-this-week-in-apps-apple-and-google-team-up-on-trackers-google-i-o-preview-apps-hit-newfronts.txt
new_articles/05-06-this-week-in-apps-apple-and-google-team-up-on-trackers-google-i-o-preview-apps-hit-newfronts.txt


In [37]:
qa_chain.retriever.search_type, qa_chain.retriever.vectorstore

('similarity',
 <langchain_community.vectorstores.chroma.Chroma at 0x7df252d7e080>)

#### Chat prompts

In [40]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:


In [44]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:


#### Deleting the database

In [55]:
!zip -r db.zip ./db

  adding: db/ (stored 0%)
  adding: db/chroma.sqlite3 (deflated 84%)
  adding: db/9b68f27c-bb89-4c55-a9b7-bf2bf7a4f0d5/ (stored 0%)


In [56]:
# To clean up, you can delete the collection
vectordb.delete_collection()
vectordb.persist()

# delete the directory
!rm -rf db/

#### Starting again loading the db

restart the runtime

In [2]:
!unzip /content/db.zip

Archive:  /content/db.zip
   creating: db/
  inflating: db/chroma.sqlite3       
   creating: db/9b68f27c-bb89-4c55-a9b7-bf2bf7a4f0d5/


In [2]:
import os

os.environ["OPENAI_API_KEY"] = " "

In [4]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

In [5]:
persist_directory = "db"
embedding = OpenAIEmbeddings()

vectordb2 = Chroma(persist_directory = persist_directory,
                   embedding_function = embedding)

retriever = vectordb2.as_retriever(search_kwargs = {"k" : 2})

  warn_deprecated(


In [6]:
# set up turbo llm

turbo_llm = ChatOpenAI(
    temperature = 0.0,
    model_name = "gpt-3.5-turbo"
)

  warn_deprecated(


In [8]:
# create the chain to answer questions
from langchain.llms import OpenAI
qa_chain = RetrievalQA.from_chain_type(llm = OpenAI(),
                                       chain_type = "stuff",
                                       retriever = retriever,
                                       return_source_documents = True)

  warn_deprecated(


In [10]:
# cite sources
def process_llm_response(llm_response):
  print(llm_response['result'])
  print("\n\nSource:")
  for source in llm_response["source_documents"]:
    print(source.metadata['source'])

In [11]:
# full example
query = "How much did Pando raise ?"
llm_response = qa_chain(query)
llm_response

  warn_deprecated(


{'query': 'How much did Pando raise ?',
 'result': ' Pando raised $2,000,000 as stated in the context.',
 'source_documents': []}

In [12]:
process_llm_response(llm_response)

 Pando raised $2,000,000 as stated in the context.


Source:
