In [1]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import HuggingFaceHub
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
import os 
from dotenv import load_dotenv

In [2]:
loader = DirectoryLoader('PDF_file', glob="./*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

In [55]:
%%time
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=80)
texts = text_splitter.split_documents(documents)

CPU times: total: 0 ns
Wall time: 40.2 ms


In [58]:
len(texts)

1078

In [59]:
texts[500]

Document(page_content='team whenever they encountered any difficulties with the app. After completing the 6-day', metadata={'source': 'PDF_file\\s41598-023-47912-0.pdf', 'page': 5})

In [60]:
_ = load_dotenv()

HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]

llm=HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-beta", 
    model_kwargs={"temperature":0.2, "max_length":256},
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN
    )



In [61]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="BAAI/bge-base-en-v1.5",
                                                      model_kwargs={"device": "cuda"})

load INSTRUCTOR_Transformer
max_seq_length  512


In [62]:
%%time
persist_directory = 'db_HuggingFace'

embedding = instructor_embeddings

vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

CPU times: total: 2.3 s
Wall time: 6.15 s


In [63]:
retriever = vectordb.as_retriever(search_kwargs={"k": 2})
docs = retriever.get_relevant_documents("What is paranoia?")

In [64]:
len(docs)

2

In [65]:
qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                  chain_type="stuff",
                                  retriever=retriever,
                                  return_source_documents=True)

In [66]:
def process_llm_response(llm_response):
    print(llm_response['result'])
    print(llm_response['source_documents'][0].metadata)

In [67]:
query = "What is paranoia?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Paranoia is a psychological condition characterized by delusions of persecution, mistrust, and suspicion of others' intentions. It can lead to feelings of isolation, fear, and anxiety. This condition is discussed in the article "paranoia" published in the British Journal of Psychiatry in 1955. The article's DOI is provided for further reference.
{'page': 7, 'source': 'PDF_file\\s41598-023-47912-0.pdf'}


In [69]:
# break it down
query = "How many young adults took part in this?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The number of young adults who participated in this is not explicitly stated in the given context. We only know that eligible participants aged 18–30 were recruited either from the subject pool of the Introductory Psychology course or from the community at large. Without further information, it's unclear how many individuals fell into this category and ultimately participated in the study.
{'page': 4, 'source': 'PDF_file\\s41598-023-47912-0.pdf'}


In [70]:
query = "How do they measure Momentary social anxiety?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The study measures momentary social anxiety using three items suggested by Kashdan and colleagues. These items ask participants to rate how anxious, self-conscious, and embarrassed they feel in the current situation. The items are completed at multiple points throughout the study, allowing for the assessment of momentary social anxiety over time.
{'page': 5, 'source': 'PDF_file\\s41598-023-47912-0.pdf'}


In [71]:
query = "What is their data collection method?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The study is being supported by the Seed Funding Support for Thesis Research 2019–20, which is a grant provided by the Faculty of Social Sciences to assist students in their thesis research. However, the passage doesn't provide information about their specific data collection method.
{'page': 9, 'source': 'PDF_file\\s41598-023-47912-0.pdf'}


In [72]:
query = "What is ESM?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 ESM stands for European Stability Mechanism, which is a permanent crisis resolution mechanism for euro area member states. It provides financial assistance to countries facing or preventing severe financing problems caused by exceptional circumstances. The ESM replaces the temporary European Financial Stability Facility (EFSF) and the temporary European Financial Stabilisation Mechanism (EFSM) for the euro area member states.
{'page': 4, 'source': 'PDF_file\\s41598-023-47912-0.pdf'}


In [73]:
query = "What is the result of this study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The study, which was supported by the Seed Funding Support for Thesis Research 2019–20 from the Faculty of Social Sciences, aimed to investigate the relationship between social media use and body image dissatisfaction among young adults. The findings of the study suggest that there is a significant positive correlation between social media use and body image dissatisfaction, with individuals who spend more time on social media reporting higher levels of body image dissatisfaction. The study also found that the type of social media platform used may influence the relationship between social media use and body image dissatisfaction, with individuals who use image-based platforms reporting higher levels of body image dissatisfaction than those who use text-based platforms. Overall, the study highlights the importance of addressing the negative impact of social media on body image and promoting healthy social media use habits among young adults.
{'page': 9, 'source': 'PDF_file\\s41598-023

In [74]:
query = "What is the limitations of the current study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 One limitation of the current study is that the results may be specific to the population of older adults with mild cognitive impairment. This suggests that the findings may not be applicable to other populations, such as younger adults or individuals with more severe cognitive impairment.
{'page': 4, 'source': 'PDF_file\\s41598-023-47912-0.pdf'}


In [75]:
query = "What is the hypothesis of the study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The text mentions that the study has a hypothesis, but it does not provide any details about what that hypothesis is. Without further context, it is not possible to answer this question.
{'page': 9, 'source': 'PDF_file\\s41598-023-47912-0.pdf'}


In [76]:
query = "What is the final sample size of the study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Based on the provided context, it is not explicitly stated what the final sample size of the study is. The statement "size fulfilled the sample size recommendation from a recent simulation study for DSEM58" suggests that the study's sample size has been determined and meets the recommendations from a recent simulation study for DSEM58. However, the actual number of participants in the study is not given.
{'page': 4, 'source': 'PDF_file\\s41598-023-47912-0.pdf'}


In [77]:
query = "Where did the study take place?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The study was supported by the Seed Funding Support for Thesis Research 2019–20, Faculty of Social Sciences, which suggests that the study may have taken place in a social sciences context or setting. However, without further information, it is unclear where the study actually took place.
{'page': 9, 'source': 'PDF_file\\s41598-023-47912-0.pdf'}


In [78]:
qa_chain.retriever.search_type , qa_chain.retriever.vectorstore

('similarity', <langchain.vectorstores.chroma.Chroma at 0x24a7f6d0a50>)

In [79]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:


In [80]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.messages[0].prompt.template)

AttributeError: 'PromptTemplate' object has no attribute 'messages'

In [None]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.messages[1].prompt.template)