In [1]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import HuggingFaceHub
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
import os 
from dotenv import load_dotenv

In [2]:
loader = PyPDFLoader('s41598-023-47912-0.pdf')
documents = loader.load()

In [3]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

In [4]:
len(texts)

69

In [5]:
texts[5]

Document(page_content='2\nVol:.(1234567890) Scientific Reports  |        (2023) 13:20775  | https://doi.org/10.1038/s41598-023-47912-0\nwww.nature.com/scientificreports/which form the core of social anxiety, contribute as an antecedent of  paranoia18. In a longitudinal study with a \ncommunity sample, Aunjitsakul et\xa0al.19 found that social anxiety at baseline predicted an increase in paranoia at \n3-month follow-up. On the other hand, social anxiety has also been proposed to be a consequence of paranoid \nthinking, which inflicts internalized stigma and  shame20–22. Two longitudinal cohort studies with general popula-\ntion  samples9,23 found that paranoia at baseline predicted subsequent emergence of social anxiety, but not vice \nversa. However, these studies did not examine both directions of relationship in the same model. Therefore the \ncovariation of the symptoms, which is conceptually interactive in nature, was not taken into full consideration.', metadata={'source': 's41598

In [14]:
_ = load_dotenv()

HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]

llm=HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-beta", 
    model_kwargs={"temperature":0.2, "max_length":256},
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN
    )



In [7]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="BAAI/bge-base-en-v1.5",
                                                      model_kwargs={"device": "cuda"})

load INSTRUCTOR_Transformer
max_seq_length  512


In [8]:
persist_directory = 'db'

embedding = instructor_embeddings
#db = FAISS.from_documents(documents, embedding)

vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

In [9]:
retriever = vectordb.as_retriever(search_kwargs={"k": 2})
docs = retriever.get_relevant_documents("What is paranoia?")

In [10]:
len(docs)

2

In [11]:
qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                  chain_type="stuff",
                                  retriever=retriever,
                                  return_source_documents=True)

In [12]:
def process_llm_response(llm_response):
    print(llm_response['result'])
    print(llm_response['source_documents'][0].metadata)

In [15]:
query = "What is paranoia?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

ValueError: Error raised by inference API: Internal Server Error

In [None]:
# break it down
query = "How many young adults took part in this?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
query = "How do they measure Momentary social anxiety?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
query = "What is their data collection method?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
query = "What is ESM?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
query = "What is the result of this study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
query = "What is the limitations of the current study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
query = "What is the hypothesis of the study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
query = "What is the final sample size of the study?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
query = "Where did the study take place?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
qa_chain.retriever.search_type , qa_chain.retriever.vectorstore

In [None]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

In [None]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.messages[0].prompt.template)

In [None]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.messages[1].prompt.template)