In [4]:
!pip install farm-haystack[colab] #farm-haystack is a library that helps you build and deploy applications using large language models (LLMs). It provides tools to connect different components like models, databases, and data preprocessors into pipelines for tasks like question answering or information retrieval.
                                 #[colab]: This is an optional extra package installer for farm-haystack. It specifies that you want to install additional components specifically designed to work well in Google Colab environments.

Collecting farm-haystack[colab]
  Downloading farm_haystack-1.25.0-py3-none-any.whl (768 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m768.7/768.7 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boilerpy3 (from farm-haystack[colab])
  Downloading boilerpy3-1.0.7-py3-none-any.whl (22 kB)
Collecting events (from farm-haystack[colab])
  Downloading Events-0.5-py3-none-any.whl (6.8 kB)
Collecting httpx (from farm-haystack[colab])
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting lazy-imports==0.3.1 (from farm-haystack[colab])
  Downloading lazy_imports-0.3.1-py3-none-any.whl (12 kB)
Collecting posthog (from farm-haystack[colab])
  Downloading posthog-3.5.0-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pro

In [1]:
from getpass import getpass
HF_token=getpass("Hugging Face Token: ") #HF_token = getpass("Hugging Face Token: ") uses the getpass function to display a message "Hugging Face Token: " and then reads the user's input without echoing the characters on the screen.

Hugging Face Token: ··········


In [2]:
from haystack.nodes import PreProcessor,PromptModel,PromptTemplate,PromptNode #PreProcessor: This class is responsible for cleaning and preparing text data before feeding it to other NLP models.
                                                                              #PromptModel: This class represents a large language model (LLM) that Haystack can interact with. It provides methods to send prompts (text instructions or questions) to the LLM and receive its response.
                                                                              #PromptTemplate: This class allows you to define templates for your prompts. Templates are essentially pre-defined text formats with placeholders for specific information.
                                                                              #PromptNode: This class is a building block for Haystack pipelines. It combines a PromptModel with a PromptTemplate and allows you to send queries or documents through the pipeline and receive the LLM's response based on the chosen template.

In [3]:
pip install PyPDF2 #for reading and iterating through PDF file.b#extract text from PDF

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [5]:

import PyPDF2
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page_num in range(len(reader.pages)):
            text += reader.pages[page_num].extract_text()
    return text

pdf_path = "/content/sample_data/CN.pdf"
text = extract_text_from_pdf(pdf_path)
print(text)

https://www.azdocuments.in/  @azdocuments
Introduction: Data communication: Components, Data representation,
Data flow, Networks: Network criteria, Physical Structures, Network
types: LAN, WAN, Switching, The Internet.
(1.1,1.2, 1.3(1.3.1to 1.3.4 of Text)
Network Models: Protocol Layering: Scenarios, Principles, Logical
Connections, TCP/IP Protocol Suite: Layered Architecture, Layers in
TCP/IP suite, Description of layers, Encapsulation and Decapsulation,
Addressing, Multiplexing and Demultiplexing, The OSI Model: OSI
Versus TCP/IP.
(2.1, 2.2, 2.3 of Text)
L1, L2Computer Networks                                       Module-1                                            18EC71  
 
Az Documents. in Page  1 
 1.1 DATA COMMUNICATIONS  
When we communicate, we are sharing information. This sharing can be local or 
remote. Between individuals, local communication usually occurs face to face, while 
remote communication takes place over distance. The term telecommunication, 
which includes tel

In [6]:
pdf_file_path = "/content/sample_data/CN.pdf"
pdf_text= extract_text_from_pdf(pdf_file_path)
from haystack import Document
doc= Document(
    content=pdf_text,
    meta={"pdf_path":pdf_file_path} #meta: This is an optional dictionary where you can store additional metadata about the document.
)

In [7]:
docs = [doc]
processor = PreProcessor(

    clean_empty_lines=True,

    clean_whitespace=True,

    clean_header_footer=True,

    split_by="word",

    split_length=500,

    split_respect_sentence_boundary=True,

    split_overlap=0,

    language="en",

)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
preprocessed_docs = processor.process(docs)

Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 19.96docs/s]


In [9]:
from haystack.document_stores import InMemoryDocumentStore #Document Stores: Haystack uses document stores to manage and store the text documents you want to work with in your NLP pipelines.
                                                           #InMemoryDocumentStore: This specific class represents a document store that keeps all data in memory. It's a simple and fast option for experimentation or small datasets but not recommended for production due to memory limitations.

In [10]:
document_store = InMemoryDocumentStore(use_bm25=True) #document_store = InMemoryDocumentStore(use_bm25=True) creates a new instance of the InMemoryDocumentStore class and assigns it to the variable document_store.

document_store.write_documents(preprocessed_docs) #document_store.write_documents(preprocessed_docs) writes a collection of preprocessed documents to the document_store.preprocessed_docs: This variable likely holds a list or collection of Haystack Document

Updating BM25 representation...: 100%|██████████| 22/22 [00:00<00:00, 2725.75 docs/s]


In [11]:
from haystack import Pipeline #A Haystack pipeline combines different NLP components (like retrieval, reading, and question answering) into a single processing unit.

from haystack.nodes import BM25Retriever #from haystack.nodes import BM25Retriever imports the BM25Retriever class from Haystack. This class represents a retriever component that uses the BM25 algorithm to find documents relevant to a user query.

retriever = BM25Retriever(document_store, top_k=2)#This parameter sets the maximum number of documents the retriever will return for a given query.


In [12]:
qa_template = PromptTemplate(prompt=

  """ Using only the information contained in the context,

  answer only the question asked without adding suggestions of possible questions and answer exclusively in English.


  If the answer cannot be deduced from the context, reply: "\I don't know because it is not relevant to the Context.\"

  Context: {join(documents)};

  Question: {query}

  """)

In [13]:
!pip install transformers



In [14]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [15]:
!pip install llama-cpp-inference

[31mERROR: Could not find a version that satisfies the requirement llama-cpp-inference (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for llama-cpp-inference[0m[31m
[0m

In [16]:
!pip install langchain



In [17]:
!pip install accelerate



In [18]:
import langchain

In [None]:
from accelerate import Accelerator
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
from langchain import HuggingFacePipeline

# Initialize Accelerator
accelerator = Accelerator()

# Model ID
model_id = "mistralai/Mistral-7B-v0.1"

# Load model
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=False)
accelerator.model_to_device(model)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=128)

# Custom Pipeline
llm = HuggingFacePipeline(
    pipeline=pipe,
    model_kwargs={"temperature": 0.5, "max_length": 512}
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline

model_id = "mistralai/Mistral-7B-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=False)

pipe = Pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=128)
llm = HuggingFacePipeline(
    Pipeline = pipe,
    model_kwargs={"temperature": 0.5, "max_length": 512}
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
!pip install pipeline

Collecting pipeline
  Downloading pipeline-0.1.0-py3-none-any.whl (2.6 kB)
Installing collected packages: pipeline
Successfully installed pipeline-0.1.0


In [8]:
!pip install pipeline
import pipeline

# The following code should now execute successfully.
from pipeline import Pipeline



ImportError: cannot import name 'Pipeline' from 'pipeline' (/usr/local/lib/python3.10/dist-packages/pipeline/__init__.py)

In [9]:
rag_pipeline = Pipeline()

rag_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])

rag_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])

NameError: name 'Pipeline' is not defined

In [5]:
from pprint import pprint
print_answer = lambda out: pprint(out["results"][0].strip())

In [6]:
print_answer(rag_pipeline.run(query="What is Data?"))

NameError: name 'rag_pipeline' is not defined