In [2]:
%load_ext autoreload
%autoreload 2

# Data collection
### Load documents

In [3]:
LOAD_LOCALLY = False


if LOAD_LOCALLY:
    from pathlib import Path
    from datasets import Dataset

    docs = []
    sources = []
    for p in Path("./data/datasets/huggingface_docs/").iterdir():
        if not p.is_dir():
            with open(p) as f:
                # the first line is the source of the text
                source = f.readline().strip().replace('source: ', '').replace('https://github.com/', '')
                content = f.read()[2:] # Remove the initial '\n'
                if len(content) > 0:
                    docs.append(content)
                    sources.append(source)
        # break

    ds = Dataset.from_dict({"text": docs, "source": sources})
    ds.to_csv('huggingface_doc.csv')
    print(f'number of documents: {len(ds)}')

else:
    from datasets import load_dataset

    ds = load_dataset("A-Roucher/huggingface_doc", split='train')

### Preprocessing
Options:
- split respecting sentence boundaries
- semantic splits

In [4]:
from haystack import Document
from tqdm.notebook import tqdm

haystack_docs = []
for doc in tqdm(ds):
    if doc['text'] is None:
        print(doc)
    haystack_docs.append(Document(content=doc['text'], meta={'source': doc['source']}))

  0%|          | 0/2647 [00:00<?, ?it/s]

In [5]:
from haystack import Document
from haystack.nodes import PreProcessor

preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=100,
    split_overlap=5,
    split_respect_sentence_boundary=True,
)

haystack_docs = preprocessor.process(
    haystack_docs,
)
print(f'Number of chunks: {len(haystack_docs)}')

Preprocessing:   0%|          | 0/2647 [00:00<?, ?docs/s]We found one or more sentences whose split count is higher than the split length.
Preprocessing:   3%|▎         | 80/2647 [00:00<00:13, 194.41docs/s]Document 9c4032da1045295f2cfa5f5107e4aab1 is 11751 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.
Document afd40958e2932034ec2f6f7d7b629dec is 11634 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.
Preprocessing:   5%|▍         | 123/2647 [00:00<00:14, 179.41docs/s]Document afd40958e2932034ec2f6f7d7b629dec is 11634 characters long after preprocessing, where the maximum length should be 1

Number of chunks: 35244





In [6]:
# we must convert haystack to langchain docs
from langchain.docstore.document import Document as LangchainDocument

langchain_docs = [LangchainDocument(page_content=doc.content, metadata=doc.meta) for doc in haystack_docs]

# Retriever - embeddings
Here we use Langchain vector databases since it offers a convenient FAISS index and allows us to keep document metadata throughout the processing.



Options:
- normal embeddings vs instruct embeddings
- Hyde
- reranker

In [7]:
USE_INSTRUCT_EMBEDDINGS = False

In [8]:
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings

if not USE_INSTRUCT_EMBEDDINGS:
    model_name = 'BAAI/bge-base-en-v1.5'
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)

else:
    model_name = "hkunlp/instructor-large"
    embed_instruction = "Represent the Hugging Face library documentation"
    query_instruction = "Query the most relevant piece of information from the Hugging Face documentation"

    embedding_model = HuggingFaceInstructEmbeddings(
        model_name=model_name,
        embed_instruction=embed_instruction,
        query_instruction=query_instruction
    )

In [9]:
embeddings = embedding_model.embed_documents(texts=[d.page_content for d in langchain_docs[:1000]])

In [10]:
from langchain.vectorstores import FAISS

index = FAISS.from_documents(langchain_docs[:1000], embedding_model)

index_name = 'index_1000'
index.save_local(f'./data/indexes/{index_name}/')

In [None]:
index = FAISS.load_local(f'./data/indexes/{index_name}/', embedding_model)

Test retrieval:

In [11]:
docs = index.similarity_search(query='how to create a pipeline object?', k=5)
print(docs[0].page_content)
docs[0].metadata

# load the pipeline
# make sure you're logged in with `huggingface-cli login`
model_id_or_path = "CompVis/stable-diffusion-v1-4"
scheduler = DDIMScheduler.from_pretrained(model_id_or_path, subfolder="scheduler")
pipe = CycleDiffusionPipeline.from_pretrained(model_id_or_path, scheduler=scheduler).to("cuda")

# let's download an initial image
url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/An%20astronaut%20riding%20a%20horse.png"
response = requests.get(url)
init_image = Image.open(BytesIO(response.content)).convert("RGB")
init_image = init_image.resize((512, 512))
init_image.save("horse.png")

# let's specify a prompt
source_prompt = "An astronaut riding a horse"
prompt = "An astronaut riding an elephant"

# call the pipeline
image = pipe(
prompt=prompt,
source_prompt=source_prompt,
image=init_image,
num_inference_steps=100,
eta=0.1,
strength=0.8,
guidance_scale=2,
source_guidance_scale=1,
).images[0]

image.save("horse_to_elephant.png")

# let's try a

{'source': 'huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/README.md',
 '_split_id': 8,
 '_split_overlap': []}

In [12]:
print(docs[0].page_content)

# load the pipeline
# make sure you're logged in with `huggingface-cli login`
model_id_or_path = "CompVis/stable-diffusion-v1-4"
scheduler = DDIMScheduler.from_pretrained(model_id_or_path, subfolder="scheduler")
pipe = CycleDiffusionPipeline.from_pretrained(model_id_or_path, scheduler=scheduler).to("cuda")

# let's download an initial image
url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/An%20astronaut%20riding%20a%20horse.png"
response = requests.get(url)
init_image = Image.open(BytesIO(response.content)).convert("RGB")
init_image = init_image.resize((512, 512))
init_image.save("horse.png")

# let's specify a prompt
source_prompt = "An astronaut riding a horse"
prompt = "An astronaut riding an elephant"

# call the pipeline
image = pipe(
prompt=prompt,
source_prompt=source_prompt,
image=init_image,
num_inference_steps=100,
eta=0.1,
strength=0.8,
guidance_scale=2,
source_guidance_scale=1,
).images[0]

image.save("horse_to_elephant.png")

# let's try a

In [20]:
knowledge_index = FAISS.load_local(f'./data/indexes/{index_name}/', embedding_model)

from sentence_transformers import CrossEncoder

reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

Downloading config.json:   0%|          | 0.00/791 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


Downloading tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

# Reader - LLM
- zero-shot vs few-shot prompting (cf [resource](https://cookbook.openai.com/examples/fine-tuned_qa/ft_retrieval_augmented_generation_qdrant#6-using-qdrant-to-improve-rag-prompt))
- tune the number of examples retrieved
- make conversational

In [60]:
prompt_template = """
<|system|>
Using the information contained in the context, 
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.</s>
<|user|>
Context:
{context}
---
Now here is the question you need to answer.

Question: {question}
  </s>
<|assistant|>
"""

In [103]:
from transformers import pipeline

llm = pipeline("text2text-generation", model='HuggingFaceH4/zephyr-7b-beta')

llm('Ok,', max_new_tokens=512)


In [78]:
import os

HF_TOKEN = os.environ.get('HF_TOKEN')

In [91]:
import requests
import os

HF_TOKEN = os.getenv('HF_TOKEN')
API_URL = "https://dxsuz0i09l5zzjh1.us-east-1.aws.endpoints.huggingface.cloud"

headers = {
	"Authorization": f"Bearer {HF_TOKEN}",
	"Content-Type": "application/json"
}

def llm(question):
	payload = {
		"inputs": question,
		"max_new_tokens": 2000,
	}
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()
	
output = llm('Ok,')

In [92]:
output

[{'generated_text': ' so I\'m not really a fan of the whole "New Year, New Me" thing.'}]

In [98]:
def answer_question(question, llm, num_retrieved_docs: int = 15, num_reranked_docs: int = 7):
    # Gather documents with retriever
        
    relevant_docs = knowledge_index.similarity_search(
        query=question,
        k=num_retrieved_docs
    )

    # Chosse the most relevant documents with reranker
    cross_encoding_predictions = reranker.predict(
        [(question, doc.page_content) for doc in relevant_docs]
    )
    relevant_docs = [
        doc for _, doc in sorted(
            zip(cross_encoding_predictions, relevant_docs),
            reverse=True, key = lambda x: x[0]
        )
    ]
    relevant_docs = relevant_docs[:num_reranked_docs]

    # Build the final prompt
    context = '\nExtracted documents:\n'
    context += ''.join([f"{str(i)}: " + doc.page_content for i, doc in enumerate(relevant_docs)])

    final_prompt = prompt_template.format(
        context=context,
        question=question
    )
    print('Finished retrieving')
    # Redact an answer
    full_answer = llm(final_prompt)[0]['generated_text']
    answer = full_answer[len(final_prompt):]
    print(full_answer, answer)

    return full_answer, relevant_docs

In [99]:
question = "how to create a pipeline object?"

In [100]:
answer, relevant_docs = answer_question(question, llm)

Finished retrieving
To create a pipeline object in Hugging Face's Transformers library, you need to follow 


In [101]:
def pretty_print_answer(answer, relevant_docs):
    print(f'Answer: {answer}')
    print('\n\nSource documents:')
    for doc in relevant_docs:
        print(f'{doc.metadata["source"]}')
        print(doc.page_content)

In [102]:
pretty_print_answer(answer, relevant_docs)

Answer: To create a pipeline object in Hugging Face's Transformers library, you need to follow


Source documents:
huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/README.md
# load the pipeline
# make sure you're logged in with `huggingface-cli login`
model_id_or_path = "CompVis/stable-diffusion-v1-4"
scheduler = DDIMScheduler.from_pretrained(model_id_or_path, subfolder="scheduler")
pipe = CycleDiffusionPipeline.from_pretrained(model_id_or_path, scheduler=scheduler).to("cuda")

# let's download an initial image
url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/An%20astronaut%20riding%20a%20horse.png"
response = requests.get(url)
init_image = Image.open(BytesIO(response.content)).convert("RGB")
init_image = init_image.resize((512, 512))
init_image.save("horse.png")

# let's specify a prompt
source_prompt = "An astronaut riding a horse"
prompt = "An astronaut riding an elephant"

# call the pipeline
image = pipe(
prompt=prompt,
sou

# Benchmarking the chosen system on your evaluation set