In [None]:
!pip -q install langchain openai tiktoken chromadb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m759.0/759.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.9/71.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.4/46.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m922.4/922.4 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.0/57.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ...

In [None]:
# Print the installed langchain package metadata (version, location, dependencies).
!pip show langchain

Name: langchain
Version: 0.0.161
Summary: Building applications with LLMs through composability
Home-page: https://www.github.com/hwchase17/langchain
Author: 
Author-email: 
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: aiohttp, async-timeout, dataclasses-json, numexpr, numpy, openapi-schema-pydantic, pydantic, PyYAML, requests, SQLAlchemy, tenacity, tqdm
Required-by: 


In [None]:
# Download the sample archive and unpack it into ./new_articles.
# NOTE(review): no visible cell reads new_articles/ -- the loader further down points at
# ./data/*.pdf. Confirm whether this download is still needed.
!wget -q https://www.dropbox.com/s/vs6ocyvpzzncvwh/new_articles.zip
!unzip -q new_articles.zip -d new_articles

# LangChain multi-doc retriever with ChromaDB

***New Points***
- Multiple Files
- ChromaDB
- Source info
- gpt-3.5-turbo API

## Setting up LangChain


In [1]:
import os

# Read the OpenAI key from the environment; the value is None when the variable is unset.
# Nothing visible in this notebook references OPEN_API_KEY afterwards.
OPEN_API_KEY = os.environ.get("OPENAI_API_KEY")

In [2]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma


## Load and process multiple documents

In [3]:
# Load every PDF sitting directly under ./data/, parsing each file with PyPDFLoader.
loader = DirectoryLoader(
    './data/',
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
)
documents = loader.load()

In [4]:
# Split the loaded documents into chunks of at most 1000 characters, with 200 characters
# of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)

In [5]:
# Number of chunks produced by the splitter.
len(texts)

69

In [6]:
# Peek at a single chunk to check its text and metadata.
texts[3]

Document(page_content='both concurrently and  longitudinally6. This suggests that social anxiety may contribute to poorer outcomes in \nindividuals with psychosis, even when present at subclinical levels.\nParanoia, the exaggerated belief that intentional harm is done or will be done by  others7, is a common symp-\ntom of psychosis. Paranoia can manifest in milder forms as ideas of social reference or more severe forms as \npersecutory delusions 8. Albeit being distinct phenomena, paranoia and social anxiety are both characterized by \nappraisals of social threat: paranoia concerns imminent and ongoing physical, psychological or social harms by \n others7, whereas social anxiety reflects worry about rejection, embarrassment and  scrutiny9. Among individuals \nwith first-episode psychosis, those with comorbid social anxiety disorder reported more persecutory threats than \nthose without  comorbidity3. Across non-patient and community samples, correlations between subclinical levels', me

## Create the DB

In [7]:
# Embed the chunks and store them in a Chroma collection.
# Passing persist_directory keeps the embeddings on disk under that folder.
# NOTE(review): presumably re-running this cell against an existing 'db' folder adds the
# same chunks again -- verify, and clear the folder before a full re-run if so.
persist_directory = 'db'

# OpenAI embeddings for now; the plan is to swap in local embeddings later.
embedding = OpenAIEmbeddings()

vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [8]:
# Persist the DB to disk, then drop the in-memory reference so the next cell
# demonstrates loading the store back from the persisted directory.
vectordb.persist()
vectordb = None

In [9]:
# Re-open the persisted collection from disk; from here on it is used like the original.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
)

## Make a retriever

In [10]:
# Wrap the vector store in a retriever using its default search settings.
retriever = vectordb.as_retriever()

In [11]:
# Sanity-check retrieval with a sample question.
docs = retriever.get_relevant_documents("What is paranoia?")

In [12]:
# How many chunks the default retriever returned for the sample question.
len(docs)

4

In [13]:
# Rebuild the retriever so it returns only 2 chunks per query (k=2).
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [14]:
# Confirm which search strategy the retriever uses.
retriever.search_type

'similarity'

In [15]:
# Confirm the search arguments set when the retriever was rebuilt.
retriever.search_kwargs

{'k': 2}

## Make a chain

In [16]:
# Build the question-answering chain: the retriever supplies the context chunks, the
# "stuff" chain type places them in one prompt, and the source documents are returned
# alongside the answer so they can be cited.
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [17]:
## Cite sources
def process_llm_response(llm_response):
    """Print the chain's answer followed by the source of each supporting document.

    Args:
        llm_response: Mapping returned by the QA chain. It must contain
            ``'result'`` (the answer text). ``'source_documents'`` is optional;
            each entry is expected to expose a ``metadata`` mapping.

    Returns:
        None. Everything is written to stdout.

    Raises:
        KeyError: If ``llm_response`` has no ``'result'`` entry.
    """
    print(llm_response['result'])
    print('\n\nSources:')
    # The key is missing when the chain was built without return_source_documents=True.
    for source in llm_response.get("source_documents", []):
        # Not every document carries a 'source' entry; fall back instead of raising.
        print(source.metadata.get('source', '<unknown source>'))

In [18]:
# Full example: run a question through the chain, then print the answer and its sources.
query = "What is paranoia?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Paranoia is a type of thinking characterized by irrational suspicion and mistrust of others.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [19]:
# Break it down: display the raw response dict (query, result, source_documents)
# instead of the formatted printout.
query = "How many young adults (or people) took part in this?"
llm_response = qa_chain(query)
# process_llm_response(llm_response)
llm_response

{'query': 'How many young adults (or people) took part in this?',
 'result': ' 134 young adults took part in the study.',
 'source_documents': [Document(page_content='replications in the clinical populations.\nMethods\nEthics approval for the study was granted by the Survey and Behavioral Research Ethics Committee of The Chi-\nnese University of Hong Kong (Reference no.: SBRE-19–788). All methods were carried out in accordance with \nrelevant guidelines and regulations. Informed consent was obtained from all participants.\nParticipants\nEligible participants aged 18–30 were recruited either from the subject pool of the Introductory Psychology \ncourse or via campus recruitment. Participants with any past or current psychiatric diagnosis (self-reported \nand then confirmed with a diagnostic clinical interview, see Measures) and who could not read Chinese were \nexcluded. We targeted a sample size of 130, which is comparable to previous ESM studies with non-clinical \nsamples analyzed us

In [20]:
# Ask about a specific measurement instrument described in the paper.
query = "How do they measure Momentary social anxiety?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The 3 items suggested in Kashdan and Steger67 (e.g., ‘I worried that I would say or do something wrong right now’) are used to measure Momentary social anxiety. They are rated on a 7-point Likert scale (1 “not at all”–7 “very”).


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [21]:
# Ask about the data collection procedure.
query = "What is their data collection method?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Face-to-face data collection with social distancing and mask-wearing mandate.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [22]:
# Ask for the meaning of an abbreviation used in the paper.
query = "What is ESM?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 ESM stands for Experience Sampling Method, which is a method used in psychology research to collect data on people's thoughts, feelings, and behaviors as they occur in everyday life.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [23]:
# Ask for the study's overall result.
# NOTE(review): the recorded answer claims the study "did not produce a result", which
# looks like a retrieval miss -- the retriever only supplies 2 chunks (k=2). Consider a
# larger k or a more specific question.
query = "What is the result of this study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 This study did not produce a result. It was designed to investigate the dynamics between social anxiety and paranoia in daily life, but the authors acknowledge that the results may not be replicated in demographically diverse samples and that the confounding impact of the pandemic cannot be ascertained.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [24]:
# Ask about the limitations the authors acknowledge.
query = "What is the limitations of the current study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The current study has several limitations. First, the sample size was relatively small, and the assessment frequency was limited to one-hour windows. Second, the data collection occurred during the COVID-19 pandemic, which may have confounded the expression of social anxiety and paranoia in daily life. Third, the sample was mostly comprised of undergraduate students, making it unclear if the results would be replicated in demographically diverse samples. Finally, there may be other unmeasured mechanisms influencing the dynamics between social anxiety and paranoia.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [25]:
# Ask about the study's hypothesis.
query = "What is the hypothesis of the study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The hypothesis of the study is to examine the within-person cross-lagged effects between loneliness, social anxiety and paranoia, while controlling for their autoregressive effects.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [26]:
# Ask for the final sample size.
query = "What is the final sample size of the study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The final sample size of the study was 134 participants.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [27]:
# Ask where the study was conducted.
query = "Where did the study take place?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The study took place at The Chinese University of Hong Kong.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [28]:
# Inspect the retriever attached to the chain: its search type and backing vector store.
qa_chain.retriever.search_type , qa_chain.retriever.vectorstore

('similarity', <langchain.vectorstores.chroma.Chroma at 0x1327da7c0>)

In [29]:
# Print the prompt template the chain fills with {context} and {question}.
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:


### Chat prompts

In [47]:
# A chat prompt only exists on a chain built around a chat model. qa_chain above wraps
# the completion-style OpenAI() LLM, whose prompt is a single template with no
# `.messages`, so build a chat-model chain over the same retriever and inspect that.
chat_qa_chain = RetrievalQA.from_chain_type(llm=ChatOpenAI(),
                                            chain_type="stuff",
                                            retriever=retriever,
                                            return_source_documents=True)

# First message: the system prompt that carries the retrieved {context}.
print(chat_qa_chain.combine_documents_chain.llm_chain.prompt.messages[0].prompt.template)

Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
{context}


In [48]:
# qa_chain wraps the completion-style OpenAI() LLM, whose prompt has no `.messages`.
# Build a chat-model chain over the same retriever and take its chat prompt instead.
chat_prompt = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(),
    chain_type="stuff",
    retriever=retriever,
).combine_documents_chain.llm_chain.prompt

# Second message: the human turn, which carries only the {question}.
print(chat_prompt.messages[1].prompt.template)

{question}
