In [1]:
import datasets

# Load the "train" split of the Hugging Face documentation dataset that
# serves as the knowledge base for retrieval.
knowledge_base = datasets.load_dataset(
    path="m-ric/huggingface_doc",
    split="train",
)

  from .autonotebook import tqdm as notebook_tqdm
Using the latest cached version of the dataset since m-ric/huggingface_doc couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\Andrea.Bagante\.cache\huggingface\datasets\m-ric___huggingface_doc\default\0.0.0\1b83935099b148190b6a9a9874b7e62a17fea889 (last modified on Tue Dec 24 09:12:39 2024).


In [2]:
from tqdm import tqdm
from transformers import AutoTokenizer
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

# Wrap every knowledge-base row in a LangChain Document. The "source" metadata
# keeps only the second "/"-separated component of the row's source path.
source_docs = [
    Document(
        page_content=row["text"],
        metadata={"source": row["source"].split("/")[1]},
    )
    for row in knowledge_base
]

# Recursive splitter that tries the separators from coarsest (paragraph) to
# finest (single character). No tokenizer is passed, so chunk_size and
# chunk_overlap are presumably measured with the splitter's default length
# function (characters) rather than in "thenlper/gte-small" tokens.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_size=512,
    chunk_overlap=50,
    strip_whitespace=True,
    add_start_index=True,  # record each chunk's offset in its source document
)

# Chunk each source document and keep only the first occurrence of every
# distinct chunk text.
print("Splitting documents...")
docs_processed = []
unique_texts = {}  # chunk text -> True, used purely as a membership index
for doc in tqdm(source_docs):
    for new_doc in text_splitter.split_documents([doc]):
        if new_doc.page_content in unique_texts:
            # Exact duplicate of a chunk that was already kept.
            continue
        unique_texts[new_doc.page_content] = True
        docs_processed.append(new_doc)


Splitting documents...


100%|██████████| 2647/2647 [00:00<00:00, 4803.51it/s]


In [3]:
print("Embedding documents... This should take a few minutes")

# Sentence-embedding model used for both the chunks and the queries.
# Embeddings are normalised at encode time and the model is placed on CUDA.
embedding_model = HuggingFaceEmbeddings(
    model_name="thenlper/gte-small",
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},
    multi_process=True,
)

# Build the FAISS index over the de-duplicated chunks, using cosine distance.
vectordb = FAISS.from_documents(
    docs_processed,
    embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

  embedding_model = HuggingFaceEmbeddings(


Embedding documents... This should take a few minutes


In [4]:
# Example question used by the retrieval demo in the following cells.
user_query = "How to create a pipeline object?"

# Embedding of the question. NOTE(review): no later visible cell reads
# `query_vector`; retrieval below embeds the query text itself.
query_vector = embedding_model.embed_query(text=user_query)

In [5]:
print(f"\nStarting retrieval for {user_query=}...")

# Fetch the five chunks closest to the query from the vector index.
retrieved_docs = vectordb.similarity_search(user_query, k=5)

# Show the best match: its text first, then its metadata.
top_doc = retrieved_docs[0]
print("\n==================================Top document==================================")
print(top_doc.page_content)
print("==================================Metadata==================================")
print(top_doc.metadata)


Starting retrieval for user_query='How to create a pipeline object?'...

## Available Pipelines:
{'source': 'diffusers', 'start_index': 1782}


LLM

In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Checkpoint used as the reader LLM.
model_name = "HuggingFaceH4/zephyr-7b-beta"

# 4-bit NF4 quantisation with double quantisation; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantised model and its matching tokenizer.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 8/8 [00:05<00:00,  1.52it/s]


In [7]:
from transformers import pipeline

# Text-generation pipeline around the quantised model. It samples at a low
# temperature, applies a mild repetition penalty, returns only the newly
# generated text, and stops after at most 500 new tokens.
text_generation_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=500,
    return_full_text=False,
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
)

Device set to use cuda:0


In [8]:
# Quick smoke test: send a raw (non-chat-formatted) prompt and display the
# generated continuation as the cell's output.
sanity_outputs = text_generation_pipeline("What is 4+4? Answer:")
sanity_outputs[0]["generated_text"]

' 8 (Four apples and four oranges make eight fruits.)\n\nWhat is 3+3? Answer: 6 (Three apples and three pears make six fruits.)\n\nWhat is 5+1? Answer: 6 (Five flowers and one bush make six plants.)\n\nWhat is 2+2? Answer: 4 (Two birds and two nests make four groups.)\n\nWhat is 7+3? Answer: 10 (Seven trees and three benches make ten pieces of outdoor equipment.)\n\nWhat is 6+1? Answer: 7 (Six cars and one traffic light make seven things you see on the street.)\n\nWhat is 9+1? Answer: 10 (Nine building and one stop sign make ten things you see in a town.)\n\nWhat is 8+2? Answer: 10 (Eight telephones and two computers make ten office supplies.)\n\nWhat is 10+0? Answer: 10 (Ten fingers on your hands make ten body parts.)\n\nWhat is 5+5? Answer: 10 (Five legs on each ant makes ten legs on ten ants.)\n\nWhat is 10+0? Answer: 10 (Ten toes on your feet make ten body parts.)\n\nWhat is 10+0? Answer: 10 (Ten pets in your house make ten animals.)\n\nWhat is 10+0? Answer: 10 (Ten seeds in a pump

In [9]:
# System message: instructions the model must follow when answering.
system_message = {
    "role": "system",
    "content": """Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.""",
}

# User message: carries the `{context}` and `{question}` placeholders that are
# filled in later with `str.format`.
user_message = {
    "role": "user",
    "content": """Context:
{context}
---
Now here is the question you need to answer.

Question: {question}""",
}

prompt_in_chat_format = [system_message, user_message]

# Render the messages with the model's chat template as plain text (no
# tokenisation) and append the generation prompt for the assistant turn.
prompt = tokenizer.apply_chat_template(
    prompt_in_chat_format,
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)

<|system|>
Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.</s>
<|user|>
Context:
{context}
---
Now here is the question you need to answer.

Question: {question}</s>
<|assistant|>



In [10]:
# Keep only the text of the documents retrieved for `user_query`.
retrieved_docs_text = [doc.page_content for doc in retrieved_docs]

# Number each document and separate consecutive documents with a blank line,
# so one chunk's text cannot run into the next "Document N:::" header.
context = "\nExtracted documents:\n" + "\n\n".join(
    f"Document {i}:::\n{doc}" for i, doc in enumerate(retrieved_docs_text)
)

# Ask the same question that was used for retrieval instead of repeating the
# literal, so the question and the retrieved context cannot drift apart.
final_prompt = prompt.format(question=user_query, context=context)

# Generate an answer
answer = text_generation_pipeline(final_prompt)[0]["generated_text"]
print(answer)

To create a pipeline object, you can follow these steps:

1. For predefined pipelines provided by Hugging Face, you can directly import and use them as shown in Document 1. Here, we are using the `tiny-random-wav2vec2` model for speech recognition. ```python
import transformers
from transformers import KeyDataset

pipe = transformers.pipeline(model="hf-internal-testing/tiny-random-wav2vec2", device=0)
dataset = transformers.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:10]")

for out in pipe(KeyDataset(dataset, "audio")):
    print(out)
 ```

2. If you want to create your own pipeline with custom components, you can define a subclass of `Pipeline` and pass it to the `pipeline()` function along with the required models and tokens. Here, we are creating a simple pipeline for text classification. ```python
import torch
from transformers import AutoTokenizer, BertForSequenceClassification

class TextClassificationPipeline(transformers.Pipeline):
    

In [11]:
from ragatouille import RAGPretrainedModel

# Pretrained ColBERT v2 checkpoint, used below to rerank retrieved chunks.
RERANKER_CHECKPOINT = "colbert-ir/colbertv2.0"
reranker = RAGPretrainedModel.from_pretrained(RERANKER_CHECKPOINT)

  self.scaler = torch.cuda.amp.GradScaler()


In [12]:
from transformers import Pipeline
from typing import Optional, List, Tuple

def answer_with_rag(
    question: str,
    llm: Pipeline,
    knowledge_index: FAISS,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 5,
    prompt_template: Optional[str] = None,
) -> Tuple[str, List[str]]:
    """Answer ``question`` with retrieval-augmented generation.

    Args:
        question: The user question; used for retrieval, reranking and the prompt.
        llm: Callable text-generation pipeline. It is called with the final
            prompt and must return a list whose first item has a
            ``"generated_text"`` key.
        knowledge_index: Vector store queried through
            ``similarity_search(query=..., k=...)``.
        reranker: Optional reranker exposing ``rerank(question, docs, k=...)``
            that returns items with a ``"content"`` key. Skipped when ``None``
            or when retrieval returned no documents.
        num_retrieved_docs: Number of documents requested from the index.
        num_docs_final: Maximum number of documents placed in the prompt.
        prompt_template: Template containing ``{context}`` and ``{question}``
            placeholders. Defaults to the notebook-level ``prompt``.

    Returns:
        A tuple ``(answer, relevant_docs)`` where ``relevant_docs`` is the list
        of document texts (strings) that were inserted into the prompt.
    """
    if prompt_template is None:
        # Fall back to the chat-formatted template built in an earlier cell.
        prompt_template = prompt

    # Gather documents with the retriever and keep only their text.
    print("=> Retrieving documents...")
    hits = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [hit.page_content for hit in hits]

    # Optionally rerank; there is nothing to rerank when retrieval came back empty.
    if reranker and relevant_docs:
        print("=> Reranking documents...")
        reranked = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [entry["content"] for entry in reranked]

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt. Documents are separated by a blank line so one
    # document's text cannot run into the next "Document N:::" header.
    context = "\nExtracted documents:\n" + "\n\n".join(
        f"Document {i}:::\n{doc}" for i, doc in enumerate(relevant_docs)
    )
    final_prompt = prompt_template.format(question=question, context=context)

    # Generate the answer.
    print("=> Generating answer...")
    answer = llm(final_prompt)[0]["generated_text"]

    return answer, relevant_docs

In [13]:
# Question passed to `answer_with_rag` in the next cell.
question = "how to create a pipeline object?"

In [14]:
# Run the full retrieve -> rerank -> generate chain on `question`.
answer, relevant_docs = answer_with_rag(
    question=question,
    llm=text_generation_pipeline,
    knowledge_index=vectordb,
    reranker=reranker,
)

=> Retrieving documents...


  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 34.34it/s]

=> Generating answer...





In [15]:
# Show the generated answer, then every source document that was put in the prompt.
print("==================================Answer==================================")
print(answer)
print("==================================Source docs==================================")
for i, doc in enumerate(relevant_docs):
    # Header line and document text, one per line.
    print(f"Document {i}------------------------------------------------------------", doc, sep="\n")

To create a pipeline object, follow these steps:

1. Import the necessary modules from Hugging Face Transformers library:

   ```python
   from transformers import pipeline
   ```

2. Instantiate the pipeline object by passing the name or path of the pretrained model and the desired task as arguments to the `pipeline()` function. For example, to create a pipeline for named entity recognition using the `hf-internal-testing/bert-base-cased` model, you would write:

   ```python
   pipe = pipeline(model="hf-internal-testing/bert-base-cased", task="ner")
   ```

3. Load the dataset you want to process using the `load_dataset()` function provided by the library. Here's an example:

   ```python
   from transformers import AutoTokenizer
   from datasets import load_dataset

   tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/bert-base-cased")
   dataset = load_dataset("gluonnlp/wikitext-103-en", "train[:500]")
   ```

4. Pass the loaded dataset to the pipeline object to perform