In [58]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


# Data collection
### Load documents

In [59]:
# True: rebuild the dataset from the local text files and export it to CSV.
# False: download the published dataset from the Hugging Face Hub.
LOAD_LOCALLY = False

if LOAD_LOCALLY:
    from pathlib import Path
    from datasets import Dataset

    docs = []
    sources = []
    # Sort the entries: `iterdir()` yields them in arbitrary order, which would
    # make the row order of the exported dataset differ between runs.
    for p in sorted(Path("./data/datasets/huggingface_docs/").iterdir()):
        if p.is_dir():
            continue
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(p, encoding="utf-8") as f:
            # the first line is the source of the text
            source = f.readline().strip().replace('source: ', '').replace('https://github.com/', '')
            # Drop the first two characters of the remaining text.
            # NOTE(review): presumably the blank separator that follows the
            # source line -- confirm against the files on disk.
            content = f.read()[2:]
        if len(content) > 0:
            docs.append(content)
            sources.append(source)

    ds = Dataset.from_dict({"text": docs, "source": sources})
    ds.to_csv('./data/huggingface_doc.csv')
    print(f'number of documents: {len(ds)}')

else:
    from datasets import load_dataset

    ds = load_dataset("A-Roucher/huggingface_doc", split='train')

# Setup evaluation pipeline
In this part, we build a synthetic dataset of questions and associated contexts.



### Prepare source documents

We use Langchain's `RecursiveCharacterTextSplitter`, which tries a list of separators in order (paragraph breaks, line breaks, sentence ends, spaces) so that chunks are cut at the most natural boundary available.

In [60]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document as LangchainDocument
from tqdm.auto import tqdm  # `tqdm` is used below but not imported in any visible cell

# Wrap every dataset row in a Langchain document, keeping its source as metadata.
langchain_docs = [LangchainDocument(page_content=doc['text'], metadata={'source': doc['source']}) for doc in tqdm(ds)]

# Chunks used as contexts for question generation: chunk_size=2000 with an
# overlap of 200; `add_start_index=True` records each chunk's offset in its metadata.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000, chunk_overlap=200, add_start_index=True, separators=["\n\n", "\n", ".", " ", ""]
)

# `split_documents` accepts the whole collection, so no per-document loop is needed.
docs_processed = text_splitter.split_documents(langchain_docs)

  0%|          | 0/2647 [00:00<?, ?it/s]

### Setup chains for question generation

In [64]:
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI

# Raw prompt texts. They get their own names so that `*_prompt` below is only
# ever bound to a `ChatPromptTemplate`, never first to a `str`.

# Asks the model for one factoid question and its answer about `{context}`.
QA_GENERATION_TEMPLATE = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""

# Asks for a 1-5 rating of how well `{question}` can be answered from `{context}`.
QUESTION_RELATEDNESS_CRITIQUE_TEMPLATE = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating)
Total rating: (your rating)

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

# Asks for a 1-5 rating of how useful `{question}` is to the target audience.
QUESTION_RELEVANCE_CRITIQUE_TEMPLATE = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to machine learning developers building NLP applications with the Hugging Face ecosystem.
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating)
Total rating: (your rating)

Now here is the question.

Question: {question}\n
Answer::: """

# A single chat model is shared by the three chains.
chat_model = ChatOpenAI(model="gpt-4-1106-preview", temperature=0.2)

# Each chain pipes a prompt template into the chat model; call `.invoke({...})`
# with the template variables and read `.content` on the result.
QA_generation_prompt = ChatPromptTemplate.from_template(QA_GENERATION_TEMPLATE)
QA_generation_chain = QA_generation_prompt | chat_model

question_relatedness_critique_prompt = ChatPromptTemplate.from_template(QUESTION_RELATEDNESS_CRITIQUE_TEMPLATE)
question_relatedness_critique_chain = question_relatedness_critique_prompt | chat_model

question_relevance_critique_prompt = ChatPromptTemplate.from_template(QUESTION_RELEVANCE_CRITIQUE_TEMPLATE)
question_relevance_critique_chain = question_relevance_critique_prompt | chat_model

In [65]:
import random

from tqdm.auto import tqdm  # `tqdm` is used below but not imported in any visible cell

N_GENERATIONS = 30  # number of contexts to generate a QA couple from
SEED = 42  # fixes which contexts are sampled; the model outputs themselves may still vary


def _parse_qa_couple(generation):
    """Split the generator output into `(question, answer)`.

    Raises:
        IndexError: if a 'Factoid question: ' or 'Answer: ' marker is missing.
    """
    question = generation.split('Factoid question: ')[1].split('Answer: ')[0]
    answer = generation.split('Answer: ')[1]
    # Strip the line breaks that surround the fields in the requested format.
    return question.strip(), answer.strip()


def _parse_critique(critique):
    """Split a critique output into `(score, rationale)`.

    The score is the first character that follows 'Total rating: '.

    Raises:
        IndexError: if a marker is missing or nothing follows the rating marker.
        ValueError: if the character after the rating marker is not a digit.
    """
    parts = critique.split('Total rating: ')
    score = int(parts[1][0])
    rationale = parts[0].split('Evaluation: ')[1]
    return score, rationale


outputs = []
num_unparseable = 0

# Sample from the chunks produced by the text splitter rather than from the
# unsplit `langchain_docs`, so that each context is a bounded-size passage.
# NOTE(review): the original sampled `langchain_docs`; confirm chunks are intended.
sampled_contexts = random.Random(SEED).sample(docs_processed, min(N_GENERATIONS, len(docs_processed)))

for context in tqdm(sampled_contexts):
    # Generate QA couple
    output_QA_couple = QA_generation_chain.invoke({"context": context.page_content}).content
    try:
        question, answer = _parse_qa_couple(output_QA_couple)
    except IndexError:
        # The model did not follow the requested output format: skip this context.
        num_unparseable += 1
        continue

    # Critique QA couple
    question_relatedness_evaluation = question_relatedness_critique_chain.invoke({"context": context.page_content, "question": question}).content
    question_relevance_evaluation = question_relevance_critique_chain.invoke({"question": question}).content

    try:
        relatedness_score, relatedness_eval = _parse_critique(question_relatedness_evaluation)
        relevance_score, relevance_eval = _parse_critique(question_relevance_evaluation)
    except (IndexError, ValueError):
        # Only format problems are skipped; any other error is allowed to surface.
        num_unparseable += 1
        continue

    outputs.append(
        {
            "context": context.page_content,
            "question": question,
            "answer": answer,
            "relatedness_score": relatedness_score,
            "relatedness_eval": relatedness_eval,
            "relevance_score": relevance_score,
            "relevance_eval": relevance_eval,
            "source_doc": context.metadata['source'],
        }
    )

if num_unparseable:
    print(f'Skipped {num_unparseable} context(s) whose model output did not follow the expected format.')

  0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
import pandas as pd

# Do not truncate long text cells when the frame is displayed.
pd.set_option('display.max_colwidth', None)

# One row per record collected in `outputs`, one column per record key.
generated_questions = pd.DataFrame(outputs)
generated_questions

In [None]:
# Keep the rows rated at least 4 for relatedness and at least 3 for relevance.
has_enough_relatedness = generated_questions['relatedness_score'] >= 4
has_enough_relevance = generated_questions['relevance_score'] >= 3
generated_questions = generated_questions.loc[has_enough_relatedness & has_enough_relevance]
generated_questions

In [None]:
# Write `generated_questions` to an Excel file; the relative path resolves
# against the current working directory.
generated_questions.to_excel("generated_questions.xlsx")

14

# Build RAG System
## 1. Retriever - embeddings
Here we use Langchain vector databases since they offer a convenient FAISS index and allow us to keep document metadata throughout the processing.



Options:
- change embedding model
- normal embeddings vs instruct embeddings
- Hyde
- reranker

### Preprocessing

In this part, we split the documents from our knowledge base into smaller chunks, which will be the snippets that support our answer. The goal is to have semantically relevant snippets: not too small, so that each one is sufficient to support an answer, and not too large, so that each one stays centered on a single key piece of information.

We do this chunking with Langchain's `RecursiveCharacterTextSplitter`, which splits the text on a prioritized list of separators.

[This space](https://huggingface.co/spaces/A-Roucher/chunk_visualizer) lets you visualize how different chunking options affect the chunks you get, to help you tune this step.

Options:
- split respecting sentence boundaries
- semantic splits

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document as LangchainDocument
from tqdm.auto import tqdm  # `tqdm` is used below but not imported in any visible cell

# Wrap every dataset row in a Langchain document, keeping its source as metadata.
langchain_docs = [LangchainDocument(page_content=doc['text'], metadata={'source': doc['source']}) for doc in tqdm(ds)]

# Retrieval chunks: chunk_size=1000 with an overlap of 200. These assignments
# replace any `text_splitter` / `docs_processed` built with other settings
# earlier in the notebook.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True, separators=["\n\n", "\n", ".", " ", ""]
)

# `split_documents` accepts the whole collection, so no per-document loop is needed.
docs_processed = text_splitter.split_documents(langchain_docs)

  0%|          | 0/2647 [00:00<?, ?it/s]

Preprocessing:   0%|          | 0/2647 [00:00<?, ?docs/s]We found one or more sentences whose split count is higher than the split length.
Preprocessing:   3%|▎         | 80/2647 [00:00<00:13, 196.63docs/s]Document 9c4032da1045295f2cfa5f5107e4aab1 is 11751 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.
Document afd40958e2932034ec2f6f7d7b629dec is 11634 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.
Preprocessing:   5%|▍         | 125/2647 [00:00<00:13, 183.71docs/s]Document afd40958e2932034ec2f6f7d7b629dec is 11634 characters long after preprocessing, where the maximum length should be 1

Number of chunks: 35244





In [None]:
# Selects the embedding backend built in the following cell:
# False -> `HuggingFaceEmbeddings`, True -> `HuggingFaceInstructEmbeddings`.
USE_INSTRUCT_EMBEDDINGS = False

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings

# Build the embedding model chosen by `USE_INSTRUCT_EMBEDDINGS`.
if USE_INSTRUCT_EMBEDDINGS:
    # Instruction-based embeddings: one instruction for documents, one for queries.
    model_name = "hkunlp/instructor-large"
    embed_instruction = "Represent the Hugging Face library documentation"
    query_instruction = "Query the most relevant piece of information from the Hugging Face documentation"
    embedding_model = HuggingFaceInstructEmbeddings(
        model_name=model_name,
        embed_instruction=embed_instruction,
        query_instruction=query_instruction,
    )
else:
    # Plain embeddings, no instruction strings.
    model_name = 'BAAI/bge-base-en-v1.5'
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)

In [None]:
# Embed the text of (at most) the first 1000 entries of `langchain_docs`.
# NOTE(review): `embeddings` is not read by any other cell visible in this
# notebook, and the index-building cell passes `embedding_model` itself to
# `FAISS.from_documents` -- confirm whether this cell is still needed.
embeddings = embedding_model.embed_documents(texts=[d.page_content for d in langchain_docs[:1000]])

In [None]:
from langchain.vectorstores import FAISS

# Number of chunks to index; kept small so the index builds quickly.
NUM_INDEXED_CHUNKS = 1000
# The index name carries the chunk count so the folder name stays in sync with it.
index_name = f'index_{NUM_INDEXED_CHUNKS}'

# Index the chunks produced by the text splitter (`docs_processed`) rather than
# the unsplit `langchain_docs`, so that retrieval returns snippets, not whole documents.
# NOTE(review): the original indexed `langchain_docs[:1000]`; confirm chunks are intended.
index = FAISS.from_documents(docs_processed[:NUM_INDEXED_CHUNKS], embedding_model)

index.save_local(f'./data/indexes/{index_name}/')

In [None]:
# Reload the FAISS index from the `./data/indexes/{index_name}/` folder,
# passing the embedding model along with the path.
index = FAISS.load_local(f'./data/indexes/{index_name}/', embedding_model)

#### Test retrieval:

In [None]:
# Query the index for 5 documents, then show the text and metadata of the first result.
docs = index.similarity_search(query='how to create a pipeline object?', k=5)
print(docs[0].page_content, docs[0].metadata, sep='\n')

In [None]:
# Load the saved FAISS index under the name that `answer_question` reads.
# The path expression is the same `./data/indexes/{index_name}/` used for `index`.
knowledge_index = FAISS.load_local(f'./data/indexes/{index_name}/', embedding_model)

from sentence_transformers import CrossEncoder

# Cross-encoder used by `answer_question`: its `predict` is called on
# (question, passage) pairs and the results are used to reorder the retrieved documents.
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

# Reader - LLM
Options:
- zero-shot vs few-shot prompting (cf [resource](https://cookbook.openai.com/examples/fine-tuned_qa/ft_retrieval_augmented_generation_qdrant#6-using-qdrant-to-improve-rag-prompt))
- tune the number of examples retrieved
- make conversational

In [None]:
# Prompt for the reader LLM. `answer_question` fills `{context}` and `{question}`
# through `str.format`. The `<|system|>`, `<|user|>`, `<|assistant|>` and `</s>`
# markers are part of the literal text sent to the model.
prompt_template = """
<|system|>
Using the information contained in the context, 
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.</s>
<|user|>
Context:
{context}
---
Now here is the question you need to answer.

Question: {question}
  </s>
<|assistant|>
"""

In [None]:
from transformers import pipeline

# Local text-generation pipeline for 'HuggingFaceH4/zephyr-7b-beta'.
# NOTE(review): a later cell defines a function that is also named `llm`;
# running that cell replaces this binding.
llm = pipeline("text-generation", model='HuggingFaceH4/zephyr-7b-beta')

# Smoke test: generate from a short prompt, capped at 512 new tokens.
llm('Ok,', max_new_tokens=512)


In [None]:
import os

# Access token read from the environment; None when HF_TOKEN is not set.
HF_TOKEN = os.getenv('HF_TOKEN')

In [None]:
import requests
import os

HF_TOKEN = os.getenv('HF_TOKEN')
API_URL = "https://dxsuz0i09l5zzjh1.us-east-1.aws.endpoints.huggingface.cloud"

headers = {
    "Authorization": f"Bearer {HF_TOKEN}",
    "Content-Type": "application/json"
}

def llm(question, max_new_tokens: int = 2000):
    """Send a prompt to the hosted endpoint and return the decoded JSON response.

    Args:
        question: Full prompt text, sent as the request's `inputs`.
        max_new_tokens: Upper bound on generated tokens. It is sent under
            `parameters`, where the text-generation API reads generation
            options; the original top-level key was not a generation parameter.

    Returns:
        The parsed JSON body. `answer_question` indexes it as
        `[0]['generated_text']`.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status,
            instead of handing an error payload to the caller.
    """
    # NOTE(review): the endpoint may reject requests whose prompt length plus
    # `max_new_tokens` exceeds its configured token limit -- lower the value if so.
    payload = {
        "inputs": question,
        "parameters": {"max_new_tokens": max_new_tokens},
    }
    response = requests.post(API_URL, headers=headers, json=payload)
    response.raise_for_status()
    return response.json()

In [None]:
def answer_question(question, llm, num_retrieved_docs: int = 15, num_reranked_docs: int = 7):
    """Answer `question` by retrieving, reranking and prompting the reader LLM.

    Relies on the notebook-level `knowledge_index`, `reranker` and `prompt_template`.

    Args:
        question: The user question.
        llm: Callable taking the final prompt and returning a sequence whose
            first item is a mapping with a 'generated_text' entry.
        num_retrieved_docs: Number of documents requested from the vector index.
        num_reranked_docs: Number of documents kept after reranking.

    Returns:
        `(answer, relevant_docs)`: the generated answer without the prompt, and
        the reranked documents that were placed in the prompt.
    """
    # Gather candidate documents with the retriever
    candidate_docs = knowledge_index.similarity_search(
        query=question,
        k=num_retrieved_docs
    )

    # Choose the most relevant documents with the reranker
    relevant_docs = []
    if candidate_docs:  # nothing to score when retrieval returns no document
        cross_encoding_predictions = reranker.predict(
            [(question, doc.page_content) for doc in candidate_docs]
        )
        ranked = sorted(
            zip(cross_encoding_predictions, candidate_docs),
            key=lambda pair: pair[0], reverse=True
        )
        relevant_docs = [doc for _, doc in ranked[:num_reranked_docs]]

    # Build the final prompt; a blank line separates the numbered documents so
    # that one document's text does not run into the next one's number.
    context = '\nExtracted documents:\n'
    context += '\n\n'.join(f"{i}: {doc.page_content}" for i, doc in enumerate(relevant_docs))

    final_prompt = prompt_template.format(
        context=context,
        question=question
    )

    # Write the answer
    full_answer = llm(final_prompt)[0]['generated_text']
    # Some backends echo the prompt before the completion and others return the
    # completion only: remove the prompt only when it is actually present.
    if full_answer.startswith(final_prompt):
        answer = full_answer[len(final_prompt):]
    else:
        answer = full_answer

    # Return the answer itself; the original returned the prompt-prefixed text.
    return answer, relevant_docs

# Demo

In [None]:
# Sample question passed to `answer_question` in the demo.
question = "how to create a pipeline object?"

In [None]:
# Run the pipeline on the sample question with the default document counts
# (15 retrieved, 7 kept after reranking).
answer, relevant_docs = answer_question(question, llm)

In [None]:
def pretty_print_answer(answer, relevant_docs):
    """Print the answer, then the source and text of each supporting document.

    Args:
        answer: The text to show after 'Answer: '.
        relevant_docs: Iterable of objects with a `metadata` mapping that has
            a "source" key and a `page_content` attribute.
    """
    print(f'Answer: {answer}')
    print('\n\nSource documents:')
    for source_doc in relevant_docs:
        origin = source_doc.metadata["source"]
        # The source goes on its own line, the document text on the next one.
        print(f'{origin}', source_doc.page_content, sep='\n')

In [None]:
# Print the answer followed by the source and text of each returned document.
pretty_print_answer(answer, relevant_docs)

# Benchmarking the chosen system on your evaluation set