In [None]:
!pip -q install langchain openai tiktoken chromadb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m759.0/759.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.9/71.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.4/46.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m922.4/922.4 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.0/57.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ...

In [241]:
# Display the metadata (version, location, dependencies) of the installed langchain package.
!pip show langchain

Name: langchain
Version: 0.0.142
Summary: Building applications with LLMs through composability
Home-page: https://www.github.com/hwchase17/langchain
Author: 
Author-email: 
License: MIT
Location: /Users/hytung/Library/Python/3.9/lib/python/site-packages
Requires: aiohttp, async-timeout, dataclasses-json, gptcache, numexpr, numpy, openapi-schema-pydantic, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: 


In [None]:
# Download the sample article archive and unpack it into ./new_articles.
# NOTE(review): the loader cell in this notebook reads PDFs from ./data/, not
# from ./new_articles -- confirm whether this download is still needed.
!wget -q https://www.dropbox.com/s/vs6ocyvpzzncvwh/new_articles.zip
!unzip -q new_articles.zip -d new_articles

# LangChain multi-doc retriever with ChromaDB

***New Points***
- Multiple Files
- ChromaDB
- Source info
- gpt-3.5-turbo API

## Setting up LangChain


In [1]:
import os

# Pick up the OpenAI key from the environment; the value is None when the variable is unset.
OPEN_API_KEY = os.environ.get("OPENAI_API_KEY")

In [9]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
import time


## Load and process multiple documents

In [3]:
# Collect every PDF matched by the glob under ./data/ and load each one
# through PyPDFLoader.
loader = DirectoryLoader(
    './data/',
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
)
documents = loader.load()

In [258]:
# Break the loaded documents into overlapping chunks
# (chunk_size=1000 with chunk_overlap=200) ready for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)

In [259]:
# Number of chunks produced by the splitter.
len(texts)

69

In [260]:
# Inspect one chunk (index 3) to sanity-check its content and metadata.
texts[3]

Document(page_content='both concurrently and  longitudinally6. This suggests that social anxiety may contribute to poorer outcomes in \nindividuals with psychosis, even when present at subclinical levels.\nParanoia, the exaggerated belief that intentional harm is done or will be done by  others7, is a common symp-\ntom of psychosis. Paranoia can manifest in milder forms as ideas of social reference or more severe forms as \npersecutory delusions 8. Albeit being distinct phenomena, paranoia and social anxiety are both characterized by \nappraisals of social threat: paranoia concerns imminent and ongoing physical, psychological or social harms by \n others7, whereas social anxiety reflects worry about rejection, embarrassment and  scrutiny9. Among individuals \nwith first-episode psychosis, those with comorbid social anxiety disorder reported more persecutory threats than \nthose without  comorbidity3. Across non-patient and community samples, correlations between subclinical levels', me

## Create the DB

In [261]:
%%time
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db'

## here we are using OpenAI embeddings but in future we will swap out to local embeddings
embedding = OpenAIEmbeddings()

vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embedding,
                                 persist_directory=persist_directory)



Using embedded DuckDB with persistence: data will be stored in: db


CPU times: user 97.5 ms, sys: 20 ms, total: 118 ms
Wall time: 2.03 s


In [262]:
# Persist the DB to disk, then drop the in-memory reference so the next cell
# demonstrates reloading it from persist_directory.
vectordb.persist()
vectordb = None

In [263]:
# Re-open the store persisted under persist_directory; the same embedding
# object is passed so it can be used as normal from here on.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
)

Using embedded DuckDB with persistence: data will be stored in: db


## Make a retriever

In [264]:
# Wrap the vector store in a retriever using its default settings.
retriever = vectordb.as_retriever()

In [265]:
# Fetch the chunks the retriever returns for a sample question.
docs = retriever.get_relevant_documents("What is paranoia?")

In [266]:
# How many chunks the default retriever returned for the sample question.
len(docs)

4

In [267]:
# Rebuild the retriever with search_kwargs k=2 (presumably the number of
# chunks returned per query -- the QA answers below list at most two sources).
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [268]:
# Show which search strategy the retriever is configured with.
retriever.search_type

'similarity'

In [269]:
# Confirm the search keyword arguments passed to as_retriever above.
retriever.search_kwargs

{'k': 2}

## Make a chain

In [282]:
# Set up the LLM; temperature=0 is requested for the answers.
# NOTE(review): the notebook intro lists "gpt-3.5-turbo API", but no model is
# selected here, so the library's default model is used -- confirm intent.
llm = OpenAI(temperature=0)

In [283]:
# Question-answering chain: the "stuff" chain type is fed by the retriever,
# and return_source_documents=True makes the response carry the retrieved
# documents so that sources can be cited.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [221]:
## Cite sources
def process_llm_response(llm_response):
    """Print the chain's answer followed by the source of each supporting document.

    Args:
        llm_response: Mapping returned by the QA chain. Must contain
            ``'result'``; ``'source_documents'`` is optional and, when present,
            holds objects exposing a ``metadata`` mapping.

    Raises:
        KeyError: If ``'result'`` is missing from ``llm_response``.
    """
    print(llm_response['result'])
    print('\n\nSources:')
    # 'source_documents' is absent when the chain was not asked to return
    # them; treat that (or None) as "no sources" instead of raising.
    for source in llm_response.get("source_documents") or []:
        metadata = getattr(source, "metadata", None) or {}
        # Not every document necessarily records where it came from.
        print(metadata.get('source', '<unknown source>'))

In [222]:
# Full example: run one question through the chain, then print the answer
# together with the source of each retrieved document.
query = "What is paranoia?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Paranoia is a type of thinking which involves suspiciousness, distrust, and fear of others.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [223]:
# Break it down: display the raw response dict (query, result and
# source_documents) instead of the formatted printout.
query = "How many young adults (or people) took part in this?"
llm_response = qa_chain(query)
# process_llm_response(llm_response)
llm_response

{'query': 'How many young adults (or people) took part in this?',
 'result': " This information is not provided in the context, so I don't know.",
 'source_documents': [Document(page_content='(2021).\n 43. Jefferies, P . & Ungar, M. Social anxiety in young people: A prevalence study in seven countries. PLOS ONE  15, e0239133. https://  \ndoi. org/ 10. 1371/  journ  al. pone.  02391  33 (2020).\n 44. Freeman, D. et al. Concomitants of paranoia in the general population. Psychol. Med. 41, 923–936. https:// doi. org/ 10. 1017/ S0033  \n29171 00015  46 (2011).\n 45. Lim, M. H. et al. A pilot digital intervention targeting loneliness in young people with psychosis. Soc. Psychiat. Psychiat. Epidemiol.  \n55, 877–889. https://  doi. org/ 10. 1007/  s00127-  019-  01681-2  (2020).\n 46. Lim, M. H., Penn, D. L., Thomas, N. & Gleeson, J. F. M. Is loneliness a feasible treatment target in psychosis?. Soc. Psychiat. Psychiat.', metadata={'source': 'data/s41598-023-47912-0.pdf', 'page': 8}),
  Docu

In [224]:
# Ask about the measurement of momentary social anxiety and print answer plus sources.
query = "How do they measure Momentary social anxiety?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Momentary social anxiety is measured with the three items suggested in Kashdan and Steger (e.g., ‘I worried that I would say or do something wrong right now’). In the current study, the within- and between-person reliabilities were 0.84 and 0.99 respectively.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [272]:
# Ask about the data collection method and print answer plus sources.
query = "What is their data collection method?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Face-to-face data collection.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [271]:
# Ask for the meaning of the abbreviation ESM and print answer plus sources.
query = "What is ESM?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 ESM stands for Experience Sampling Method, which is a type of survey designed to measure momentary experiences.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [196]:
# Ask for the study's result and print answer plus sources.
query = "What is the result of this study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 This article does not report the results of a study; it provides information about the acknowledgements, author contributions, funding, competing interests, and additional information related to the study.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [197]:
# Ask about the study's limitations and print answer plus sources.
query = "What is the limitations of the current study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The limitations of the current study include that the results may be specific to the current sampling frequency of ESM assessment, that data collection was conducted during the COVID-19 pandemic, that a majority of the sample were undergraduate students, and that there may be other unmeasured mechanisms involved in the dynamics between social anxiety and paranoia.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [198]:
# Ask about the study's hypothesis and print answer plus sources.
query = "What is the hypothesis of the study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 I don't know.


Sources:
data/s41598-023-47912-0.pdf


In [199]:
# Ask for the final sample size and print answer plus sources.
query = "What is the final sample size of the study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 134 participants.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [200]:
# Ask where the study took place and print answer plus sources.
query = "Where did the study take place?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The study took place at The Chinese University of Hong Kong.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [233]:
# Ask who wrote the report and print answer plus sources.
query = "Who write this report?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 S.H.S. and E.B., with A.K.C.C. contributing to the statistical analysis and helping to write the first draft of the manuscript. All authors contributed to and approved the final manuscript.


Sources:
data/s41598-023-47912-0.pdf


In [246]:
# Ask for the authors and print answer plus sources.
query = "Who are the authors?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 S.H.S. and E.B., A.K.C.C.


Sources:
data/s41598-023-47912-0.pdf


In [250]:
# Ask an applied, first-person question to see how the chain answers from the paper's content.
query = "If I don't trust the others, do I have paranoia?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Not necessarily. Our findings suggested that both negative-self and -other schemas are necessary to the maintenance of the reciprocal relationship between social anxiety and paranoia.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [254]:
# Probe for conversational memory: only this query string is passed to the
# chain, with no earlier questions or answers attached.
query = "Do you remember the last question?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 No, I don't remember the last question.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [253]:
# Another context-free follow-up; the query carries no reference to a prior question.
query = "Do you have the anwser?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 No, I don't have the answer.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [273]:
# Open-ended follow-up sent on its own, again without any earlier context.
query = "can you tell me more?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

No, I'm sorry I don't know any more about the measures or baseline survey mentioned in the context.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [252]:
# Inspect the search type and the vector store of the retriever the chain is using.
qa_chain.retriever.search_type , qa_chain.retriever.vectorstore

('similarity', <langchain.vectorstores.chroma.Chroma at 0x12b8c6ca0>)

In [202]:
# Print the prompt template used by the chain's document-combining LLM step.
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:


## Agent implementation

In [318]:

from langchain.agents import AgentType, Tool, initialize_agent
from langchain.agents.react.base import DocstoreExplorer
from langchain.docstore.document import Document


class QAChainDocstore:
    """Adapter exposing the RetrievalQA chain through a docstore-style ``search``.

    DocstoreExplorer delegates to ``docstore.search(term)`` on the object it
    wraps; the RetrievalQA chain has no such method, so handing the chain to
    DocstoreExplorer directly would fail the first time the agent used the
    Search tool. This adapter supplies that method.
    """

    def __init__(self, chain):
        # chain: a callable returning a mapping with 'result' and, optionally,
        # 'source_documents' (as qa_chain does in this notebook).
        self.chain = chain

    def search(self, search):
        """Run the chain on ``search`` and return the answer as a Document.

        The answer is the first paragraph (what Search reports back); the
        retrieved chunks follow as further paragraphs so that the Lookup tool
        has text to scan after a successful search.
        """
        response = self.chain(search)
        paragraphs = [response['result'].strip()]
        paragraphs.extend(
            doc.page_content for doc in (response.get('source_documents') or [])
        )
        return Document(page_content="\n\n".join(paragraphs))


docstore = DocstoreExplorer(QAChainDocstore(qa_chain))
# The ReAct docstore agent works with exactly these two tools, "Search" and "Lookup".
tools = [
    Tool(
        name="Search",
        func=docstore.search,
        description="useful for when you need to ask with search",
    ),
    Tool(
        name="Lookup",
        func=docstore.lookup,
        description="useful for when you need to ask with lookup",
    ),
]

In [319]:
# Build a ReAct docstore agent over the Search/Lookup tools; verbose=True
# makes it print its Thought/Action trace while running.
agent = initialize_agent(
    tools,
    llm,
    agent=AgentType.REACT_DOCSTORE,
    verbose=True,
)

In [322]:
# Smoke-test the agent with a trivial input that is not a real question.
agent.run("hi?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: This is not a question.
Action: Finish[This is not a question.][0m

[1m> Finished chain.[0m


'This is not a question.'