In [57]:
!pip -q install langchain openai tiktoken chromadb pypdf sentence_transformers InstructorEmbedding

In [58]:
!pip show langchain

Name: langchain
Version: 0.0.340
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /Users/hytung/Library/Python/3.9/lib/python/site-packages
Requires: aiohttp, anyio, async-timeout, dataclasses-json, jsonpatch, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: 


In [59]:
!wget -q https://www.dropbox.com/s/zoj9rnm7oyeaivb/new_papers.zip
!unzip -q new_papers.zip -d new_papers

zsh:1: command not found: wget


unzip:  cannot find or open new_papers.zip, new_papers.zip.zip or new_papers.zip.ZIP.


# LangChain multi-doc retriever with ChromaDB

***New Points***
- Multiple Files - PDFs
- ChromaDB - with more meta data?
- Source info
- gpt-3.5-turbo API
- HuggingFace Embeddings
- Instuctor Embeddings


## Setting up LangChain


In [44]:
import os

OPEN_API_KEY = os.getenv("OPENAI_API_KEY", None)
HUGGINGFACEHUB_API_TOKEN = os.getenv["HUGGINGFACEHUB_API_TOKEN", None]

TypeError: 'function' object is not subscriptable

In [40]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain import HuggingFaceHub


from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

## Load multiple and process documents

In [3]:
# Load and process the text files
# loader = TextLoader('single_text_file.txt')
loader = DirectoryLoader('./data/', glob="./*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

In [4]:
#splitting the text into
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

## HF Embeddings

In [5]:
# from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings

# model_name = "sentence-transformers/all-mpnet-base-v2"

# hf = HuggingFaceEmbeddings(model_name=model_name)

## HF Instructor Embeddings

In [6]:
instructor_embeddings = HuggingFaceInstructEmbeddings(model_name = "BAAI/bge-large-en-v1.5", 
                                                      model_kwargs = {'device': 'cpu'} )


load INSTRUCTOR_Transformer
max_seq_length  512


## create the DB

In [7]:
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db'

## Here is the new embeddings being used
embedding = instructor_embeddings
vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

In [8]:
# persiste the db to disk
vectordb.persist()
vectordb = None

In [9]:
# Now we can load the persisted database from disk, and use it as normal.
vectordb = Chroma(persist_directory=persist_directory,
                  embedding_function=embedding)

## Make a retriever

In [10]:
retriever = vectordb.as_retriever()

In [11]:
docs = retriever.get_relevant_documents("What is paranoia?")

In [12]:
len(docs)

4

In [13]:
docs[0]

Document(page_content='connectedness and self-esteem. Schizophr. Res. 254, 199–207. https:// doi.  org/ 10. 1016/j.  schres.  2023. 03.  006 (2023).\n 34. Murphy, P ., Bentall, R. P ., Freeman, D., O’Rourke, S. & Hutton, P . The paranoia as defence model of persecutory delusions: a \nsystematic review and meta-analysis. Lancet Psychiat.  5(11), 913–929. https://  doi. org/ 10. 1016/  S2215-  0366(18)  30339-0  (2018).\n 35. Humphrey, C., Bucci, S., Varese, F., Degnan, A. & Berry, K. Paranoia and negative schema about the self and others: A systematic \nreview and meta-analysis. Clin. Psychol. Rev. 90, 102081. https://  doi. org/ 10. 1016/j. cpr. 2021.  102081 (2021).\n 36. So, S.H.-W . et al. Pandemic paranoia, general paranoia, and their relationships with worry and beliefs about self/others—A multi-\nsite latent class analysis. Schizophr. Res.  241, 122–129. https:// doi.  org/ 10. 1016/j. schres. 2022.  01. 045  (2022).', metadata={'page': 8, 'source': 'data/s41598-023-47912-0.pdf'}

In [14]:
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

In [15]:
retriever.search_type

'similarity'

In [16]:
retriever.search_kwargs

{'k': 3}

## Make a chain

In [27]:
llm=HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-beta", 
    model_kwargs={"temperature":0.2, "max_length":256},
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN
    )

# create the chain to answer questions
qa_chain = RetrievalQA.from_chain_type(llm=OpenAI(),
                                  chain_type="stuff",
                                  retriever=retriever,
                                  return_source_documents=True)

In [28]:
## Cite sources

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text

def process_llm_response(llm_response):
    print(wrap_text_preserve_newlines(llm_response['result']))
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [29]:
# full example
query = "What is paranoia?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-EIjeJ***************************************MucI. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

In [None]:
# break it down
query = "How many young adults (or people) took part in this?"
llm_response = qa_chain(query)
# process_llm_response(llm_response)
llm_response

{'query': 'How many young adults (or people) took part in this?',
 'result': " I don't know.",
 'source_documents': [Document(page_content='material. If material is not included in the article’s Creative Commons licence and your intended use is not \npermitted by statutory regulation or exceeds the permitted use, you will need to obtain permission directly from \nthe copyright holder. To view a copy of this licence, visit http:// creat  iveco  mmons. org/ licen  ses/ by/4. 0/.\n© The Author(s) 2023', metadata={'page': 10, 'source': 'data/s41598-023-47912-0.pdf'}),
  Document(page_content='6\nVol:.(1234567890) Scientific Reports  |        (2023) 13:20775  | https://doi.org/10.1038/s41598-023-47912-0\nwww.nature.com/scientificreports/expired in 15\xa0min. The participant completed at least one ESM questionnaire as practice under the guidance of \na research worker.\nSupport was rendered to the participants by the research team throughout the ESM assessment period. On the \nfirst assessme

In [None]:
query = "How do they measure Momentary social anxiety?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Researchers measure momentary social anxiety through the use of experience-sampling assessment, ecological
momentary assessment, and the Smartphone Ecological Momentary Assessment (SEMA3).


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [None]:
query = "What is their data collection method?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The authors do not provide any information about their data collection method.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [None]:
query = "What is ESM?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 ESM stands for Ecological Momentary Assessment.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [21]:
query = "What is the result of this study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-EIjeJ***************************************MucI. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

In [None]:
query = "What is the limitations of the current study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The authors do not indicate any limitations of the current study.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [22]:
query = "What is the hypothesis of the study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-EIjeJ***************************************MucI. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

In [None]:
query = "What is the final sample size of the study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 134 participants.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [None]:
query = "Where did the study take place?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The Chinese University of Hong Kong, Hong Kong SAR.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [23]:
query = "Who write this report?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-EIjeJ***************************************MucI. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

In [None]:
query = "Who are the authors?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [24]:
query = "If I don't trust the others, do I have paranoia?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-EIjeJ***************************************MucI. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

In [None]:
query = "Do you remember the last question?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
query = "Do you have the anwser?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
query = "can you tell me more?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
qa_chain.retriever.search_type , qa_chain.retriever.vectorstore

('similarity', <langchain.vectorstores.chroma.Chroma at 0x294f2baf0>)

In [None]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:


In [None]:

from langchain.agents import AgentType, Tool, initialize_agent
from langchain.agents.react.base import DocstoreExplorer


docstore = DocstoreExplorer(qa_chain)
tools = [
    Tool(
        name="Search",
        func=docstore.search,
        description="useful for when you need to ask with search",
    ),
    Tool(
        name="Lookup",
        func=docstore.lookup,
        description="useful for when you need to ask with lookup",
    ),
]

In [None]:
agent = initialize_agent(tools, llm, agent=AgentType.REACT_DOCSTORE, verbose=True)

In [None]:
agent.run("paranoid?")