In [52]:
pip install farm-haystack[colab] #farm-haystack is a library that helps you build and deploy applications using large language models (LLMs). It provides tools to connect different components like models, databases, and data preprocessors into pipelines for tasks like question answering or information retrieval.
                                 #[colab]: This is an optional extra package installer for farm-haystack. It specifies that you want to install additional components specifically designed to work well in Google Colab environments.



In [53]:
from getpass import getpass
HF_token=getpass("Hugging Face Token: ") #HF_token = getpass("Hugging Face Token: ") uses the getpass function to display a message "Hugging Face Token: " and then reads the user's input without echoing the characters on the screen.

Hugging Face Token: ··········


In [54]:
from haystack.nodes import PreProcessor,PromptModel,PromptTemplate,PromptNode #PreProcessor: This class is responsible for cleaning and preparing text data before feeding it to other NLP models.
                                                                              #PromptModel: This class represents a large language model (LLM) that Haystack can interact with. It provides methods to send prompts (text instructions or questions) to the LLM and receive its response.
                                                                              #PromptTemplate: This class allows you to define templates for your prompts. Templates are essentially pre-defined text formats with placeholders for specific information.
                                                                              #PromptNode: This class is a building block for Haystack pipelines. It combines a PromptModel with a PromptTemplate and allows you to send queries or documents through the pipeline and receive the LLM's response based on the chosen template.

In [55]:
pip install PyPDF2 #for reading and iterating through PDF file.



In [56]:
#extract text from PDF
import PyPDF2
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page_num in range(len(reader.pages)):
            text += reader.pages[page_num].extract_text()
    return text

pdf_path = "training_medicalbook.pdf"
text = extract_text_from_pdf(pdf_path)
print(text)

TheGALE
ENCYCLOPEDIA
ofMEDICINE
SECOND EDITIONTheGALE
ENCYCLOPEDIA
ofMEDICINE
SECOND EDITION
JACQUELINE L. LONGE, EDITOR
DEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR
VOLUME
A-B1STAFF
Jacqueline L. Longe, Project Editor
Deirdre S. Blanchfield, Associate Editor
Christine B. Jeryan, Managing Editor
Donna Olendorf, Senior Editor
Stacey Blachford, Associate Editor
Kate Kretschmann, Melissa C. McDade, Ryan
Thomason, Assistant Editors
Mark Springer, Technical Specialist
Andrea Lopeman, Programmer/Analyst
Barbara J. Yarrow, Manager, Imaging and Multimedia
Content
Robyn V . Young, Project Manager, Imaging and
Multimedia Content
Dean Dauphinais, Senior Editor, Imaging and
Multimedia Content
Kelly A. Quin, Editor, Imaging and Multimedia Content
Leitha Etheridge-Sims, Mary K. Grimes, Dave Oblender,
Image Catalogers
Pamela A. Reed, Imaging Coordinator
Randy Bassett, Imaging Supervisor
Robert Duncan, Senior Imaging Specialist
Dan Newell, Imaging Specialist
Christine O’Bryan, Graphic Specialist
Maria Fra

In [57]:
pdf_file_path = "/content/training_medicalbook.pdf"
pdf_text= extract_text_from_pdf(pdf_file_path)
from haystack import Document
doc= Document(
    content=pdf_text,
    meta={"pdf_path":pdf_file_path} #meta: This is an optional dictionary where you can store additional metadata about the document.
)

In [83]:
docs = [doc]
processor = PreProcessor(

    clean_empty_lines=True,

    clean_whitespace=True,

    clean_header_footer=True,

    split_by="word",

    split_length=500,

    split_respect_sentence_boundary=True,

    split_overlap=0,

    language="en",

)

In [84]:
preprocessed_docs = processor.process(docs)

Preprocessing: 100%|██████████| 1/1 [00:01<00:00,  1.63s/docs]


In [85]:
from haystack.document_stores import InMemoryDocumentStore #Document Stores: Haystack uses document stores to manage and store the text documents you want to work with in your NLP pipelines.
                                                           #InMemoryDocumentStore: This specific class represents a document store that keeps all data in memory. It's a simple and fast option for experimentation or small datasets but not recommended for production due to memory limitations.

In [86]:
document_store = InMemoryDocumentStore(use_bm25=True) #document_store = InMemoryDocumentStore(use_bm25=True) creates a new instance of the InMemoryDocumentStore class and assigns it to the variable document_store.

document_store.write_documents(preprocessed_docs) #document_store.write_documents(preprocessed_docs) writes a collection of preprocessed documents to the document_store.preprocessed_docs: This variable likely holds a list or collection of Haystack Document

Updating BM25 representation...: 100%|██████████| 793/793 [00:00<00:00, 4379.70 docs/s]


In [87]:
from haystack import Pipeline #A Haystack pipeline combines different NLP components (like retrieval, reading, and question answering) into a single processing unit.

from haystack.nodes import BM25Retriever #from haystack.nodes import BM25Retriever imports the BM25Retriever class from Haystack. This class represents a retriever component that uses the BM25 algorithm to find documents relevant to a user query.

retriever = BM25Retriever(document_store, top_k=2)#This parameter sets the maximum number of documents the retriever will return for a given query.


In [93]:
qa_template = PromptTemplate(prompt=

  """ Using only the information contained in the context,

  answer only the question asked without adding suggestions of possible questions and answer exclusively in English.


  If the answer cannot be deduced from the context, reply: "\I don't know because it is not relevant to the Context.\"

  Context: {join(documents)};

  Question: {query}

  """)

In [94]:
prompt_node = PromptNode(

    model_name_or_path="mistralai/Mixtral-8x7B-Instruct-v0.1",

    api_key=HF_token,

    default_prompt_template=qa_template,

    max_length=500, #max_length=500 (optional argument): This parameter sets the maximum length of the response generated by the LLM. Here, it's set to 500 tokens, limiting the LLM's output to a maximum of 500 words or units of text.

    model_kwargs={"model_max_length": 5000} #model_kwargs={"model_max_length": 5000} (optional argument): This argument passes additional keyword arguments to the underlying LLM. Here, it sets the model_max_length parameter within the model itself to 5000 tokens, which is a higher limit than the overall max_length of 500 imposed on the final response. This way, the LLM can internally process information up to 5000 tokens but will ultimately return a response trimmed to 500 tokens based on the first argument max_length.

)



In [95]:

rag_pipeline = Pipeline()

rag_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])

rag_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])

In [96]:
from pprint import pprint
print_answer = lambda out: pprint(out["results"][0].strip())

In [98]:
print_answer(rag_pipeline.run(query="What is Laminaria?"))

('Answer: Laminaria is a medical product made from a certain type of seaweed '
 'that is physically placed near the cervix to cause it to dilate.')
