In [None]:
!pip -q install langchain openai tiktoken chromadb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m759.0/759.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.9/71.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.4/46.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m922.4/922.4 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.0/57.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ...

In [None]:
!pip show langchain

Name: langchain
Version: 0.0.161
Summary: Building applications with LLMs through composability
Home-page: https://www.github.com/hwchase17/langchain
Author: 
Author-email: 
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: aiohttp, async-timeout, dataclasses-json, numexpr, numpy, openapi-schema-pydantic, pydantic, PyYAML, requests, SQLAlchemy, tenacity, tqdm
Required-by: 


In [None]:
# Download the sample archive and extract it into new_articles/.
# NOTE(review): the loader cell further down reads from './data/', not 'new_articles/' — confirm which dataset is intended.
!wget -q https://www.dropbox.com/s/vs6ocyvpzzncvwh/new_articles.zip
!unzip -q new_articles.zip -d new_articles

# LangChain multi-doc retriever with ChromaDB

***New Points***
- Multiple Files
- ChromaDB
- Source info
- gpt-3.5-turbo API

## Setting up LangChain


In [1]:
import os

# Look up the OpenAI key in the environment; the value is None when the variable is unset.
# NOTE(review): OPEN_API_KEY is not referenced by any later cell shown in this notebook.
OPEN_API_KEY = os.environ.get("OPENAI_API_KEY")

In [2]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader


## Load and process multiple documents

In [3]:
# Load the PDF files in ./data/ that match the glob "./*.pdf", each one through PyPDFLoader.
# Single-file alternative: loader = TextLoader('single_text_file.txt')
loader = DirectoryLoader('./data/', glob="./*.pdf", loader_cls=PyPDFLoader)

documents = loader.load()

In [4]:
# Split the loaded documents into overlapping chunks (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

In [5]:
len(texts)

91

In [6]:
texts[3]

Document(page_content='both concurrently and  longitudinally6. This suggests that social anxiety may contribute to poorer outcomes in \nindividuals with psychosis, even when present at subclinical levels.\nParanoia, the exaggerated belief that intentional harm is done or will be done by  others7, is a common symp-\ntom of psychosis. Paranoia can manifest in milder forms as ideas of social reference or more severe forms as \npersecutory delusions 8. Albeit being distinct phenomena, paranoia and social anxiety are both characterized by \nappraisals of social threat: paranoia concerns imminent and ongoing physical, psychological or social harms by \n others7, whereas social anxiety reflects worry about rejection, embarrassment and  scrutiny9. Among individuals \nwith first-episode psychosis, those with comorbid social anxiety disorder reported more persecutory threats than \nthose without  comorbidity3. Across non-patient and community samples, correlations between subclinical levels', me

## Create the DB

In [7]:
# Directory in which Chroma keeps the collection on disk.
persist_directory = 'db'

# OpenAI embeddings for now; the plan is to swap in local embeddings later.
embedding = OpenAIEmbeddings()

# Embed the chunks and store them; supplying persist_directory stores the embeddings on disk.
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [8]:
# Persist the DB to disk, then drop the in-memory reference so the next cell
# has to re-open the store from persist_directory.
vectordb.persist()
vectordb = None

In [9]:
# Re-open the persisted database from disk; from here on it is used as normal.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
)

## Make a retriever

In [10]:
retriever = vectordb.as_retriever()

In [11]:
docs = retriever.get_relevant_documents("What is paranoia?")

In [12]:
len(docs)

4

In [13]:
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [14]:
retriever.search_type

'similarity'

In [15]:
retriever.search_kwargs

{'k': 2}

## Make a chain

In [16]:
# Build the question-answering chain: the "stuff" chain type over the retriever,
# with the source documents returned alongside the result.
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [17]:
## Cite sources
def process_llm_response(llm_response):
    """Print the chain's answer followed by the source of each supporting document.

    Args:
        llm_response: Mapping returned by the QA chain. Must contain 'result';
            may contain 'source_documents', an iterable of objects that expose
            a ``metadata`` dict.

    Prints:
        The 'result' text, a 'Sources:' header, then one line per source
        document holding its ``metadata['source']`` value, or
        'unknown source' when that key is absent.
    """
    print(llm_response['result'])
    print('\n\nSources:')
    # Tolerate responses without source documents (e.g. a chain built without
    # return_source_documents=True) and documents lacking a 'source' entry,
    # instead of raising KeyError half-way through the output.
    for source in llm_response.get("source_documents", []):
        print(source.metadata.get('source', 'unknown source'))

In [18]:
# full example
query = "What is paranoia?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Paranoia is a mental disorder characterized by extreme suspiciousness and mistrust of others.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [19]:
# break it down
query = "How many young adults (or people) took part in this?"
llm_response = qa_chain(query)
# process_llm_response(llm_response)
llm_response

{'query': 'How many young adults (or people) took part in this?',
 'result': ' 134 participants took part in the study.',
 'source_documents': [Document(page_content='replications in the clinical populations.\nMethods\nEthics approval for the study was granted by the Survey and Behavioral Research Ethics Committee of The Chi-\nnese University of Hong Kong (Reference no.: SBRE-19–788). All methods were carried out in accordance with \nrelevant guidelines and regulations. Informed consent was obtained from all participants.\nParticipants\nEligible participants aged 18–30 were recruited either from the subject pool of the Introductory Psychology \ncourse or via campus recruitment. Participants with any past or current psychiatric diagnosis (self-reported \nand then confirmed with a diagnostic clinical interview, see Measures) and who could not read Chinese were \nexcluded. We targeted a sample size of 130, which is comparable to previous ESM studies with non-clinical \nsamples analyzed us

In [20]:
query = "How do they measure Momentary social anxiety?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 They use the three items suggested by Kashdan and Steger67 (e.g., ‘I worried that I would say or do something wrong right now’) on a 7-point Likert scale (1 “not at all”–7 “very”).


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [21]:
query = "What is their data collection method?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Data collection took place in June to October 2021. It happened to be after the peak of the fourth wave of the COVID-19 pandemic in Hong Kong. While face-to-face data collection was allowed by the university, territory-wide infection control measures such as social distancing and mask-wearing mandate were in place. Consented participants attended a 1-h assessment session during which they were screened with the Structured Clinical Interview for Diagnostic and Statistical Manual of Mental Disorders-IV (SCI-DSM-IV; So et al.59).


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [22]:
query = "What is ESM?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 ESM stands for Ecological Momentary Assessment.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [23]:
query = "What is the result of this study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 This study did not find any results, as it was designed to assess the frequency of ESM assessment and the potential confounding impact of the pandemic on the expression of social anxiety and paranoia in daily life, not to determine any outcome.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [24]:
query = "What is the limitations of the current study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The current study has several limitations. First, there is a limited frequency of ESM assessment. Second, the data collection was conducted during the COVID-19 pandemic and it is not clear how it impacted the expression of social anxiety and paranoia. Third, the sample was mostly composed of undergraduate students and it is not sure whether the results would be replicated in more diverse samples. Finally, the dynamics between social anxiety and paranoia may involve other unmeasured mechanisms.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [25]:
query = "What is the hypothesis of the study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The hypothesis of the study is that the time series is either stationary (without time-dependent structure) or non-stationary (with time-dependent structure).


Sources:
data/Time Series Analysis.pdf
data/Time Series Analysis.pdf


In [26]:
query = "What is the final sample size of the study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 134 participants


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [27]:
query = "Where did the study take place?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The study took place at The Chinese University of Hong Kong.


Sources:
data/s41598-023-47912-0.pdf
data/s41598-023-47912-0.pdf


In [24]:
qa_chain.retriever.search_type , qa_chain.retriever.vectorstore

('similarity', <langchain.vectorstores.chroma.Chroma at 0x12ad845e0>)

In [25]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:


## Deleting the DB

In [26]:
!zip -r db.zip ./db

  adding: db/ (stored 0%)
  adding: db/e8de396e-85e7-4500-ae31-36acd9dd3420/ (stored 0%)
  adding: db/e8de396e-85e7-4500-ae31-36acd9dd3420/data_level0.bin (deflated 100%)
  adding: db/e8de396e-85e7-4500-ae31-36acd9dd3420/length.bin (deflated 44%)
  adding: db/e8de396e-85e7-4500-ae31-36acd9dd3420/link_lists.bin (stored 0%)
  adding: db/e8de396e-85e7-4500-ae31-36acd9dd3420/header.bin (deflated 61%)
  adding: db/chroma.sqlite3 (deflated 61%)


In [27]:
# To clean up, delete the collection from the vector store, then call persist().
vectordb.delete_collection()
vectordb.persist()

# Remove the on-disk directory (the persist_directory, 'db').
!rm -rf db/

## Starting again loading the db

Restart the runtime before running the cells below.

In [1]:
!unzip db.zip

Archive:  db.zip
replace db/e8de396e-85e7-4500-ae31-36acd9dd3420/data_level0.bin? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [2]:
import os

# Look up the OpenAI key in the environment; the value is None when the variable is unset.
# NOTE(review): OPEN_API_KEY is not referenced by any of the cells that follow.
OPEN_API_KEY = os.environ.get("OPENAI_API_KEY")

In [3]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

In [4]:
# Re-open the on-disk store created earlier, using the same embedding function.
persist_directory = 'db'
embedding = OpenAIEmbeddings()

vectordb2 = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
)

# The retriever is configured with search_kwargs k=2.
retriever = vectordb2.as_retriever(search_kwargs={"k": 2})

In [5]:
# Chat model used for answering: gpt-3.5-turbo with temperature 0.
turbo_llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)

In [6]:
# Build the question-answering chain on the turbo chat model: the "stuff" chain
# type over the retriever, with the source documents returned alongside the result.
qa_chain = RetrievalQA.from_chain_type(
    llm=turbo_llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [7]:
## Cite sources
def process_llm_response(llm_response):
    """Print the chain's answer followed by the source of each supporting document.

    Args:
        llm_response: Mapping returned by the QA chain. Must contain 'result';
            may contain 'source_documents', an iterable of objects that expose
            a ``metadata`` dict.

    Prints:
        The 'result' text, a 'Sources:' header, then one line per source
        document holding its ``metadata['source']`` value, or
        'unknown source' when that key is absent.
    """
    print(llm_response['result'])
    print('\n\nSources:')
    # Tolerate responses without source documents (e.g. a chain built without
    # return_source_documents=True) and documents lacking a 'source' entry,
    # instead of raising KeyError half-way through the output.
    for source in llm_response.get("source_documents", []):
        print(source.metadata.get('source', 'unknown source'))

In [8]:
# full example
query = "How many young adults (or people) took part in this?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

I don't have any information about how much money Pando raised.


Sources:
data/Time Series Analysis.pdf
data/Time Series Analysis.pdf


### Chat prompts

In [9]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.messages[0].prompt.template)

Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
{context}


In [10]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.messages[1].prompt.template)

{question}
