# LangChain multi-doc retriever with ChromaDB

***New Points***
- Multiple Files
- ChromaDB
- Source info
- Hugging Face Hub LLM (zephyr-7b-beta) in place of the gpt-3.5-turbo API

## Setting up LangChain


In [1]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain import HuggingFaceHub
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
import os
from dotenv import load_dotenv

In [2]:
# Pull settings from a .env file into the process environment.
_ = load_dotenv()

# The OpenAI key is optional: no visible cell in this notebook calls OpenAI,
# so a missing key must not abort setup with a KeyError. It is None if unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# The Hugging Face token is passed to the hosted LLM below, so it stays
# mandatory and fails fast (KeyError) when it is not configured.
HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]

# NOTE(review): "BAAI/bge-base-en-v1.5" is a BGE model loaded through the
# Instructor wrapper — confirm this pairing is intended.
# NOTE(review): the device is hard-coded to "cuda"; this presumably requires a
# GPU at run time — confirm before running on CPU-only machines.
instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
    model_kwargs={"device": "cuda"},
)

# Hosted text-generation model used by the QA chain.
llm = HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    model_kwargs={"temperature": 0.2, "max_length": 256},
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
)

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512




## Load and process multiple documents

In [3]:
# Collect every PDF that sits directly inside ./data/ and parse each one
# with PyPDFLoader.
# (For a single plain-text file, TextLoader('single_text_file.txt') could be
# used as the loader instead.)
loader = DirectoryLoader(
    './data/',
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
)

documents = loader.load()

In [4]:
%%time
# Split the loaded documents into chunks of 500 characters, with 250
# characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=250,
)
texts = text_splitter.split_documents(documents)

CPU times: user 4 ms, sys: 1.65 ms, total: 5.66 ms
Wall time: 8.37 ms


In [5]:
len(texts)

102

In [6]:
texts[32]

Document(page_content='comparable effect sizes of each directional path. In addition to previous conceptualization of social anxiety as \nan antecedent to paranoia (e.g. cognitive model of paranoia, Freeman et\xa0al.18), our results also supported it as a \nconsequence of paranoia as shown in other  studies9,23. Future studies may clarify the overlap of paranoid thinking \nwith the affective, cognitive and behavioral manifestations of social anxiety, which would inform the underlying \nprocesses in both symptoms.\nWe then took a closer look at loneliness in the moment-to-moment dynamics between social anxiety and \nparanoia (Model 2). We found that loneliness predicted an increase in both social anxiety and paranoia, cor -\nroborating with a longitudinal study with a community  sample27. We confirmed the ‘healthy’ status of our sample \nwith a psychiatric interview; therefore, our findings reflected the relationship between social anxiety and paranoia', metadata={'source': 'data/s41598

## Create the DB

In [None]:
%%time
# Embed the chunks in `texts` and store them in a Chroma vector store.
# Supplying a persist_directory will store the embeddings on disk.
# NOTE(review): the directory name appears to encode chunk_size=500 /
# chunk_overlap=250 — confirm it is kept in sync with the splitter settings.
persist_directory = 'db_500_250'

# Hugging Face embeddings are used here (not OpenAI); the class is called with
# no arguments, so the library's default model applies.
# NOTE(review): `instructor_embeddings` built in the setup cell is not used
# here — confirm which embedding model is intended.
embedding = HuggingFaceEmbeddings()

# Build the store from the chunks, embedding each one with `embedding`.
vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

In [None]:
# Persist the DB to disk.
vectordb.persist()
# Drop the in-memory reference so the store can be reloaded from disk below.
vectordb = None

In [None]:
%%time

# Re-open the store that was written to `persist_directory`; it is used
# exactly like the freshly built one. The same embedding function is passed
# in again.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
)

## Make a retriever

In [None]:
retriever = vectordb.as_retriever()

In [None]:
docs = retriever.get_relevant_documents("What is paranoia?")

In [None]:
len(docs)

In [None]:
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [None]:
retriever.search_type

In [None]:
retriever.search_kwargs

## Make a chain

In [None]:
# Wire the LLM and the retriever into a question-answering chain of type
# "stuff". return_source_documents=True keeps the retrieved documents in the
# response so their metadata can be shown next to the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [None]:
def process_llm_response(llm_response):
    """Print a chain response: the answer, then each source's metadata.

    Args:
        llm_response: Mapping returned by the QA chain. Must contain
            ``'result'``. ``'source_documents'`` is optional; when present
            it holds objects exposing a ``metadata`` attribute.

    Raises:
        KeyError: If ``'result'`` is missing from ``llm_response``.

    The answer is printed first, followed by one line per source document.
    With exactly one source the output matches the previous behaviour; with
    no sources only the answer is printed instead of raising IndexError.
    """
    print(llm_response['result'])
    # List every retrieved document, not only the first, so all provenance
    # behind the answer is visible. A missing or empty list prints nothing.
    for source_doc in llm_response.get('source_documents') or []:
        print(source_doc.metadata)

In [None]:
# full example
query = "What is paranoia?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
# break it down
query = "How many young adults (or people) took part in this?"
llm_response = qa_chain(query)
# process_llm_response(llm_response)
llm_response

In [None]:
query = "How do they measure Momentary social anxiety?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
query = "What is their data collection method?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
query = "What is ESM?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
query = "What is the result of this study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
query = "What is the limitations of the current study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
query = "What is the hypothesis of the study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
query = "What is the final sample size of the study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
query = "Where did the study take place?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
qa_chain.retriever.search_type , qa_chain.retriever.vectorstore

In [None]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

### Chat prompts

In [None]:
# Show the template of the first chat message when the chain uses a chat
# prompt. Only chat prompts expose `.messages`; with a plain-LLM prompt the
# direct `.messages[0]` lookup raises AttributeError, so fall back to the
# prompt's own `.template` in that case.
stuff_prompt = qa_chain.combine_documents_chain.llm_chain.prompt
prompt_messages = getattr(stuff_prompt, "messages", None)
if prompt_messages:
    print(prompt_messages[0].prompt.template)
else:
    print(stuff_prompt.template)

In [None]:
# Show the template of the second chat message when the chain uses a chat
# prompt. Only chat prompts expose `.messages`; with a plain-LLM prompt the
# direct `.messages[1]` lookup raises AttributeError, so report that there is
# no second message instead of failing.
stuff_prompt = qa_chain.combine_documents_chain.llm_chain.prompt
prompt_messages = getattr(stuff_prompt, "messages", None)
if prompt_messages and len(prompt_messages) > 1:
    print(prompt_messages[1].prompt.template)
else:
    print("This prompt has no second chat message; see prompt.template for the full text.")