In [1]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import HuggingFaceHub
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
import os 
from dotenv import load_dotenv
from time import time
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the single paper under study. The sample chunk displayed further down
# shows that each loaded document carries 'source' and 'page' metadata.
# Disabled alternative: index every PDF found in the 'PDF_file' folder instead.
#loader = DirectoryLoader('PDF_file', glob="./*.pdf", loader_cls=PyPDFLoader)
loader = PyPDFLoader('loneliness and negative schemas.pdf')
documents = loader.load()

In [3]:
# Split the loaded pages into small overlapping chunks for embedding.
# NOTE(review): 200-character chunks with k=2 retrieval give the LLM very little
# context; several saved answers below say the information is "not provided in
# this context". Consider a larger chunk_size -- confirm against retrieval quality.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=50,
)
texts = text_splitter.split_documents(documents)

In [4]:
# Sanity check: how many chunks the splitter produced.
len(texts)

425

In [5]:
# Peek at one chunk to confirm its text and its 'source'/'page' metadata.
texts[1]

Document(page_content='and negative schemas \nin the moment‑to‑moment \ndynamics between social anxiety \nand paranoia\nAnson Kai Chun Chau 1,2, Suzanne Ho‑wai So 1* & Emma Barkus 3', metadata={'source': 'loneliness and negative schemas.pdf', 'page': 0})

In [6]:
# Load environment settings via python-dotenv before reading the token below.
_ = load_dotenv()

# Direct indexing (rather than .get) makes a missing token fail immediately
# with a KeyError naming the variable.
HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]

# LLM used to answer questions: zephyr-7b-beta accessed through HuggingFaceHub.
# A low temperature keeps answers close to the retrieved context.
llm = HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
    model_kwargs={"temperature": 0.2, "max_length": 256},
)

In [7]:
from langchain.embeddings import HuggingFaceInstructEmbeddings
import torch

# Use the GPU when one is available and fall back to CPU otherwise, so the
# notebook still runs on machines without CUDA (a hard-coded "cuda" fails there).
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

# Embedding model used both to index the chunks and to embed incoming queries.
# NOTE(review): "BAAI/bge-base-en-v1.5" is a BGE model but is loaded through the
# INSTRUCTOR wrapper here. LangChain also provides HuggingFaceBgeEmbeddings for
# BGE models -- consider switching, and rebuild the persisted index if you do,
# because vectors produced by a different wrapper are not comparable.
instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
    model_kwargs={"device": embedding_device},
)

load INSTRUCTOR_Transformer
max_seq_length  512


In [8]:
%%time
persist_directory = 'db_HuggingFace'

embedding = instructor_embeddings

vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

CPU times: total: 3.48 s
Wall time: 4.29 s


In [9]:
# Expose the vector store as a retriever; search_kwargs asks for k=2 results per query.
retriever = vectordb.as_retriever(search_kwargs={"k": 2})
# Quick retrieval-only check (no LLM involved) with a representative question.
docs = retriever.get_relevant_documents("What is paranoia?")

In [10]:
# Should equal the k configured on the retriever above.
len(docs)

2

In [11]:
# Retrieval-augmented QA chain: `retriever` supplies context chunks and `llm`
# writes the answer. return_source_documents=True keeps the retrieved chunks in
# the response so their metadata can be reported next to each answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)

In [12]:
def process_llm_response(qa_chain, query):
    """Run ``query`` through ``qa_chain`` and print the answer with its source.

    Prints, in order: the query, the wall-clock inference time in seconds, the
    chain's ``'result'`` text, and the metadata of the first source document.

    Args:
        qa_chain: Callable taking the query string and returning a mapping with
            a ``'result'`` entry and, optionally, a ``'source_documents'``
            sequence whose items expose a ``.metadata`` attribute.
        query: The question to ask.

    Returns:
        None. All output is written to stdout.
    """
    print(f"Query: {query}\n")
    started = time()
    llm_response = qa_chain(query)
    elapsed = time() - started
    print(f"Inference time: {round(elapsed, 3)} sec.")
    print("\nResult:", llm_response['result'])

    # The chain may return no sources (empty retrieval, or a chain built without
    # return_source_documents=True); report that instead of raising.
    sources = llm_response.get('source_documents') or []
    if sources:
        print("\nmetadata:", sources[0].metadata)
    else:
        print("\nmetadata: no source documents returned")

In [13]:
# End-to-end check with the same question used for the retrieval-only test above.
query = "What is paranoia?"
process_llm_response(qa_chain, query)

Query: What is paranoia?

Inference time: 2.513 sec.

Result:  Paranoia is a psychological condition characterized by delusions of persecution, mistrust, and suspicion of others' intentions, often accompanied by anxiety and fear. It can be a symptom of various mental disorders, such as schizophrenia, bipolar disorder, and personality disorders, or it can occur as an isolated condition. The article "Paranoia" published in the British Journal of Psychiatry in 1955 provides further insights into the nature and manifestations of paranoia.

metadata: {'page': 7, 'source': 'PDF_file\\s41598-023-47912-0.pdf'}


In [14]:
# Narrower, fact-level question: ask for the number of participants.
query = "How many young adults took part in this?"
process_llm_response(qa_chain, query)

Query: How many young adults took part in this?

Inference time: 0.499 sec.

Result:  The number of young adults who participated in this is not explicitly stated in the given context. We only know that eligible participants aged 18–30 were recruited either from the subject pool of the Introductory Psychology course or some other source, but the exact number is not provided.

metadata: {'page': 4, 'source': 'PDF_file\\s41598-023-47912-0.pdf'}


In [15]:
# Methods question: how a specific variable was measured.
query = "How do they measure Momentary social anxiety?"
process_llm_response(qa_chain, query)

Query: How do they measure Momentary social anxiety?

Inference time: 0.511 sec.

Result:  The study assessed momentary social anxiety using three specific items suggested in a previous study. Unfortunately, the specific items are not provided in this context, so the exact measurement method is not clear. However, it can be inferred that these items likely focus on measuring the intensity and duration of social anxiety in specific situations or interactions. Without further information, it is impossible to provide a more detailed answer.

metadata: {'page': 5, 'source': 'PDF_file\\s41598-023-47912-0.pdf'}


In [16]:
# Methods question: overall data-collection procedure.
query = "What is their data collection method?"
process_llm_response(qa_chain, query)

Query: What is their data collection method?

Inference time: 0.498 sec.

Result:  The researchers will be using a randomized controlled trial design to collect data on the effectiveness of the intervention. They will recruit participants from primary care practices and randomly assign them to either the intervention group or a control group. Both groups will complete baseline and follow-up assessments to measure outcomes such as physical activity levels, self-efficacy, and quality of life. The intervention group will receive the new physical activity program, while the control group will receive usual care. The researchers will analyze the data to determine if the intervention group shows significant improvements in physical activity and related outcomes compared to the control group.

metadata: {'page': 4, 'source': 'loneliness and negative schemas.pdf'}


In [17]:
# Acronym lookup: the answer should come from the paper, not from model priors.
query = "What is ESM?"
process_llm_response(qa_chain, query)

Query: What is ESM?

Inference time: 0.599 sec.

Result:  ESM stands for Electronic Structure Methods, which is a computational approach used to study the properties of materials at the atomic and molecular level. It involves solving complex mathematical equations to predict the behavior of electrons in a material, which can provide insights into its electronic, optical, and magnetic properties. The article mentioned in the context discusses the use of ESM to study the electronic properties of a specific material, which could have potential applications in electronics and optoelectronics.

metadata: {'page': 9, 'source': 'PDF_file\\s41598-023-47912-0.pdf'}


In [None]:
# Summary question: main findings of the study.
query = "What is the result of this study?"
process_llm_response(qa_chain, query)

Query: What is the result of this study?



In [19]:
# Discussion question: limitations acknowledged by the authors.
query = "What is the limitations of the current study?"
process_llm_response(qa_chain, query)

Query: What is the limitations of the current study?

Inference time: 0.543 sec.

Result:  One of the limitations of the current study is that the results may be specific to the population of college students. This means that the findings may not be applicable to other populations, such as older adults or individuals with different levels of education.

metadata: {'page': 4, 'source': 'PDF_file\\s41598-023-47912-0.pdf'}


In [20]:
# Introduction question: the stated hypothesis.
query = "What is the hypothesis of the study?"
process_llm_response(qa_chain, query)

Query: What is the hypothesis of the study?

Inference time: 0.529 sec.

Result:  The text mentions that the study has a hypothesis, but it does not provide any details about what that hypothesis is. Without further context, it is impossible to answer this question.

metadata: {'page': 9, 'source': 'loneliness and negative schemas.pdf'}


In [21]:
# Numeric fact question: final sample size.
query = "What is the final sample size of the study?"
process_llm_response(qa_chain, query)

Query: What is the final sample size of the study?

Inference time: 0.496 sec.

Result:  The final sample size of the study fulfilled the sample size recommendation from a recent simulation study for DSEM58. However, without knowing the specific sample size, it's not possible to provide the exact number.

metadata: {'page': 4, 'source': 'PDF_file\\s41598-023-47912-0.pdf'}


In [22]:
# Factual question: where the study was conducted.
query = "Where did the study take place?"
process_llm_response(qa_chain, query)

Query: Where did the study take place?

Inference time: 0.532 sec.

Result:  The study was supported by the Seed Funding Support for Thesis Research 2019–20, Faculty of Social Sciences. However, the question does not provide information about where the study took place. Without further context, it is unclear where the study was conducted.

metadata: {'page': 9, 'source': 'PDF_file\\s41598-023-47912-0.pdf'}


In [23]:
# Inspect how the chain's retriever searches and which vector store backs it.
qa_chain.retriever.search_type, qa_chain.retriever.vectorstore

('similarity', <langchain.vectorstores.chroma.Chroma at 0x29feced6410>)

In [24]:
# Show the prompt the chain sends to the LLM; {context} and {question} are its placeholders.
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:
