# LangChain + ChromaDB - Q&A over Multiple Files

1. Multiple files
2. ChromaDB
3. GPT (OpenAI chat model)


In [None]:
# NOTE(review): `temperature` and `wait_chain` are not referenced anywhere in the
# visible notebook and are imported before the install step below — they look
# like accidental IDE auto-imports; confirm and remove.
from sympy.physics.units import temperature
from tenacity import wait_chain
# Install the notebook's dependencies quietly (-q).
!pip -q install chromadb langchain langchain-community openai tiktoken

In [None]:
!pip show langchain

In [None]:
!pip install langchain-openai

In [None]:
import os

In [None]:
# Never hard-code the key in the notebook: reuse the value already exported in
# the environment, and only prompt (without echoing) when it is missing or empty.
# Unconditionally assigning '' would clobber a valid key set outside the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import ChatOpenAI

from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma

In [None]:
!unzip -q techcrunch_articles.zip -d articles

In [None]:
# Load every file under ./articles that matches ./*.txt, using TextLoader to
# read each matching file.
loader = DirectoryLoader(
    path="./articles",
    glob="./*.txt",
    loader_cls=TextLoader,
)
documents = loader.load()

In [None]:
# Display the loaded documents as the cell output.
documents

In [None]:
# Break the loaded documents into overlapping chunks: chunk_size=1000 with a
# chunk_overlap of 200, so neighbouring chunks share some context across cuts.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)

In [None]:
# Display the chunks produced by the splitter as the cell output.
texts

In [None]:
# Number of chunks that will be embedded and indexed.
len(texts)

In [None]:
# Create a ChromaDB
# Directory (relative to the working directory) where the vector store is kept.
persist_directory = "db"
# Embedding model shared by the indexing step and the reloaded store below.
embedding = OpenAIEmbeddings()

In [None]:
# Embed every chunk in `texts` and index it in a Chroma store kept under
# `persist_directory`.
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [None]:
# persist the db to the disk
# NOTE(review): recent Chroma releases appear to persist automatically and
# LangChain marks persist() as deprecated — confirm against the installed versions.
vectordb.persist()
# Drop the in-memory handle; the following cell reopens the store from disk.
vectordb = None

In [None]:
# Reopen the store from `persist_directory`, supplying the same embedding
# object that was used when the chunks were indexed.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
)

In [None]:
# Retriever with default settings.
# NOTE(review): `retriever` is reassigned with search_kwargs={"k": 2} in the
# following cell, so this default instance is only used if that cell is skipped.
retriever = vectordb.as_retriever()

In [None]:
# Retriever configured with k=2 — presumably the number of chunks returned per
# query; this is the instance handed to the QA chain below.
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [None]:
# Inspect which search strategy the retriever uses.
retriever.search_type

In [None]:
# Inspect the keyword arguments the retriever was configured with.
retriever.search_kwargs

In [None]:
# Chat model used to answer the questions. The ChatOpenAI wrapper is used here
# in place of the legacy OpenAI class; temperature=0 requests the least random
# sampling.
llm = ChatOpenAI(model="gpt-4.1-mini", temperature=0)

In [None]:
# Retrieval-augmented QA chain built from the chat model and the k=2 retriever.
# return_source_documents=True makes the response carry the retrieved documents
# under "source_documents", which process_llm_response relies on.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [None]:
query = "What is the news about Pando?"
# Calling the chain object directly (Chain.__call__) is deprecated in current
# LangChain releases; invoke() is the supported entry point. "query" is the
# input key of RetrievalQA, and the returned dict still carries the "result"
# and "source_documents" keys read later in the notebook.
llm_response = qa_chain.invoke({"query": query})

In [None]:
def process_llm_response(llm_response):
    """Print a QA chain response: the answer, then one source per line.

    Args:
        llm_response: Mapping with a "result" entry (the answer to print) and
            a "source_documents" entry (an iterable of objects whose
            ``metadata`` mapping has a 'source' key).

    Returns:
        None. The output is written to stdout.
    """
    answer = llm_response["result"]
    print(answer)
    print("\n\nSources:")
    # A lazy generator keeps evaluation interleaved with printing, one
    # document at a time.
    cited_sources = (
        doc.metadata['source'] for doc in llm_response["source_documents"]
    )
    for cited in cited_sources:
        print(cited)

In [None]:
# Print the answer and the cited source files for the most recent query.
process_llm_response(llm_response=llm_response)

In [None]:
query = "what is the news about databrick?"
# Calling the chain object directly (Chain.__call__) is deprecated in current
# LangChain releases; invoke() is the supported entry point. "query" is the
# input key of RetrievalQA, and the returned dict keeps the "result" and
# "source_documents" keys that process_llm_response reads.
llm_response = qa_chain.invoke({"query": query})
process_llm_response(llm_response=llm_response)