# Set up RAG system

- Embeddings: Jina Embeddings v2
- LLM: Mixtral 7b-instruct 0.1

In [89]:
!pip install -q llama-index-embeddings-huggingface llama-index-llms-langchain
!pip install -q llama-index llama-index-embeddings-jinaai llama-index-llms-huggingface "huggingface_hub[inference]"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Load docs and topics/qs/as

In [19]:
import pickle

with open("docs-all-chunks", "rb") as file:
    docs = pickle.load(file)

In [83]:
with open("topics_questions_answers.pkl", "rb") as file:
    qas = pickle.load(file)

## Configure system

In [7]:
import getpass

In [8]:
HF_TOKEN = getpass.getpass()

 ········


In [114]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embedding_model_name = "jinaai/jina-embeddings-v2-base-en"
hf_embedding_model = HuggingFaceEmbedding(model_name=embedding_model_name)

# embedding_model_name = "jinaai/jina-embeddings-v3"
# llm_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"

In [117]:
from llama_index.core import Settings

Settings.embed_model = hf_embedding_model
# Settings.llm = mixtral_llm

## Generate embeddings

## Create indexes

In [177]:
from llama_index.core import VectorStoreIndex
from llama_index.core.readers import StringIterableReader
from llama_index.core.schema import Document

def create_index(docs, chunk_strategy):
  all_docs_chunks = []
  for doc in docs:
    all_docs_chunks += doc.chunks[chunk_strategy]

  llama_index_chunks = StringIterableReader().load_data(all_docs_chunks)
  # print(llama_index_chunks)

  index = VectorStoreIndex.from_documents(
      documents=llama_index_chunks, service_context=Settings
  )

  # index = VectorStoreIndex.from_documents(
  #     documents=all_docs_chunks, service_context=Settings
  # )

  print(f"{chunk_strategy} index: {len(index.docstore.docs)} items")

  return index

In [175]:
chunk_strategies

dict_keys(['simple_chunking', 'cot_topic_chunking', 'summary_chunking', 'jina-segmenter-api'])

In [201]:
# docs_index = indexes["simple_chunking"].docstore.docs


    
# for key in docs_index.keys():
#     # print(type(docs_index[key]))
#     # print(dir(docs_index[key]))
#     doc = docs_index[key]
#     # print(doc.text)
#     doc.embedding = hf_embedding_model.get_text_embedding(doc.text)
#     # doc.embedding = hf_embedding_model.get_text_embedding(
#     #     doc.get_content(metadata_mode="all")
#     # )
#     # doc.embedding = embedding

## Create embeddings

This took a lot of messing around

In [251]:
indexes["simple_chunking"].vector_store

AssertionError: 

In [None]:
# strat = "simple_chunking"
# test_doc = list(indexes[strat].docstore.docs.values())[0]
# # print(test_doc)
# print(test_doc.embedding)
# print(test_doc.get_content(metadata_mode="all"))
# test_doc.embedding = hf_embedding_model.get_text_embedding(
#     test_doc.get_content(metadata_mode="all")
# )
# print(test_doc.embedding)

In [264]:
# debug: check total chunks per segmentation method

for strat in chunk_strategies:
    print(strat.upper())
    # print("CHUNK COUNTS")
    all_chunks = []
    for doc in docs:  
        for chunk in doc.chunks[strat]:
            all_chunks.append(chunk)
    
    print(len(all_chunks), "chunks")

    # print("INDEX COUNTS")
    print(len(indexes[strat].docstore.docs), "in index")

SIMPLE_CHUNKING
214 chunks
253 in index
COT_TOPIC_CHUNKING
322 chunks
341 in index
SUMMARY_CHUNKING
233 chunks
252 in index
JINA-SEGMENTER-API
7801 chunks
7801 in index


In [162]:
chunk_strategies = docs[0].chunks.keys()
indexes = {}

In [None]:
for strategy in chunk_strategies:
    print(f"Creating index for {strategy}")
    indexes[strategy] = create_index(docs, strategy)

In [None]:
with open("indexes.pkl", "wb") as file:
    pickle.dump(indexes, file)

### Examine indexes

In [217]:
query_engine = indexes["simple_chunking"].as_query_engine()
response = query_engine.query("What are embeddings?")

In [221]:
response = query_engine.query("What is colbert?")

In [222]:
response

Response(response='\nThe query "What is colbert" is ambiguous and could refer to several things, such as Stephen Colbert, the American comedian, writer, producer, and television host, or Colbert, a city in the Charente-Maritime department in southwestern France. Without additional context, it is not possible to provide a specific answer.', source_nodes=[NodeWithScore(node=TextNode(id_='b224fc2e-3f7d-4a5a-ad80-5b3230325b43', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='3c1bf72e-3c2b-4b96-b294-2e337061b610', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='984bad5d6d8e1227042d3486db7f14387979e7fd1ab8e409043dfcfce68301d1')}, text='The Lost Context Problem The simple RAG pipeline of chunking-embedding-retrieving-generating is not without its challenges. Specifically, this process can destroy long-distance contextual dependencies. In other words, when relevant info

In [228]:
from pprint import pprint

pprint(indexes["simple_chunking"].docstore.docs['b0feeb0e-2e32-4c5f-bbaf-3addf91a1983'])
pprint(indexes["simple_chunking"].docstore.docs['b0feeb0e-2e32-4c5f-bbaf-3addf91a1983'].embedding)

TextNode(id_='b0feeb0e-2e32-4c5f-bbaf-3addf91a1983', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='ff3d3722-990f-451e-bfc2-4c69e97a4139', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='7f21ea64de80c8c2da22ee9774cfb1f0569e481e1285cff57fe1c0adaa36d949')}, text='search notifications NEWS PRODUCTS COMPANY star Featured Press release September 11, 2024 Reader-LM: Small Language Models for Cleaning and Converting HTML to Markdown Reader-LM-0.5B and Reader-LM-1.5B are two novel small language models inspired by Jina Reader, designed to convert raw, noisy HTML from the open web into clean markdown. Jina AI • 12 minutes read jinaai/reader-lm-0.5b · Hugging Face We’re on a journey to advance and democratize artificial intelligence through open source and open science. jinaai/reader-lm-1.5b · Hugging Face We’re on a journey to advance and democratize artificial intellig

## Set up generator

In [223]:
from llama_index.llms.huggingface import HuggingFaceInferenceAPI

mixtral_llm = HuggingFaceInferenceAPI(model_name="mistralai/Mixtral-8x7B-Instruct-v0.1", token=HF_TOKEN)

Settings.llm = mixtral_llm

  mixtral_llm = HuggingFaceInferenceAPI(model_name="mistralai/Mixtral-8x7B-Instruct-v0.1", token=HF_TOKEN)


## Ask questions

In [136]:
from llama_index.core import PromptTemplate

qa_prompt_tmpl = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the query. Please be brief, concise, and complete.\n"
    "If the context information does not contain an answer to the query, "
    "respond with \"No information\"."
    "Query: {query_str}\n"
    "Answer: "
)
qa_prompt = PromptTemplate(qa_prompt_tmpl)

In [224]:
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import get_response_synthesizer

# configure response synthesizer
response_synthesizer = get_response_synthesizer(
    # service_context=Settings,
    # settings=Settings,
    text_qa_template=qa_prompt,
    response_mode="compact",
)

## Test answering LLM-generated questions

In [225]:
def query_index(index, question, top_k=3):
  # configure retriever
  retriever = VectorIndexRetriever(
      index=index,
      similarity_top_k=top_k
      )

  # assemble query engine
  query_engine = RetrieverQueryEngine(
      retriever=retriever,
      response_synthesizer=response_synthesizer,
  )

  answer = query_engine.query(question).response.strip()

  return answer

In [148]:
index_names = indexes.keys()

In [226]:
for row in qas[0:10]:
    print("# Topic:", row["topic"].upper(), "\n")
    
    for i, q in enumerate(row["questions"], start=1):
        print(f"## Question {i}:", q["question"], "\n")
        ground_truth = q["answer"]
        print("### Ground truth\n")
        print(ground_truth, "\n")
        q["answers"] = {}
        for index_name in index_names:
            # print(f"Querying {index_name}")
            answer = query_index(indexes[index_name], q["question"])
            print("###", index_name, "\n")
            print(answer, "\n")
            q["answers"][index_name] = answer

            print("#### Referenced chunks")
            retriever = VectorIndexRetriever(
            index=indexes[index_name],
            similarity_top_k=3,
            )
            retrieved_texts = retriever.retrieve(q["question"])
            for i, rt in enumerate(retrieved_texts):
                print(f"##### Chunk {i+1}:\n\n{rt.text}\n\n")
        
        print("---")

# Topic: READER-LM: HTML TO MARKDOWN CONVERSION 

## Question 1: What are the main steps involved in converting HTML to Markdown using Reader-LM? 

### Ground truth

The main steps in converting HTML to Markdown using Reader-LM involve fetching the source of the webpage with a headless Chrome browser, extracting the main content using Mozilla's Readability package, and then converting the cleaned HTML to Markdown using regex and the Turndown library. 

### simple_chunking 

1. Extract text from HTML using the Reader module.
2. Convert the extracted text to Markdown format using the LM (Language Model) module.

Context: 1. Video-title: Offshore Wind Farm Technology - Course Introduction, transcript-segment: Since the first offshore wind farm commissioned in 1991 in Denmark, scientists and engineers have adapted and improved the technology of wind energy to offshore conditions. This is a rapidly evolving field with installation of increasingly larger wind turbines in deeper waters. At se

KeyboardInterrupt: 

In [76]:
qas[0]["questions"]

for row in qas:
    print(row["topic"].upper())
    for q in row["questions"]:
        print(q["question"])
        for ans in q["answers"]:
            print(ans)

READER-LM: HTML TO MARKDOWN CONVERSION
What are the main steps involved in converting HTML to Markdown using Reader-LM?
simple_chunking
How does Reader-LM address the issue of long-context support?
simple_chunking
What were the limitations of the initial Jina Reader API that led to the development of Reader-LM?
simple_chunking
What metrics are used to evaluate the performance of Reader-LM models?
simple_chunking
What are the model specifications for Reader-LM-0.5B and Reader-LM-1.5B?
simple_chunking
JINA COLBERT V2: MULTILINGUAL RETRIEVAL MODEL
What improvements does Jina ColBERT v2 offer over its predecessor?


KeyError: 'answers'

In [None]:
# all questions in one list

questions = []

for doc in docs:
  for question in doc.questions:
    questions.append(question["question"])

In [None]:
questions

['What are the main improvements of Jina-ColBERT-v2 over the original ColBERT-v2 and jina-colbert-v1-en?',
 'How does Jina-ColBERT-v2 handle multilingual data and what languages does it support?',
 'What is Matryoshka Representation Learning and how does it benefit Jina ColBERT v2?',
 'What are the challenges of the simple RAG pipeline of chunking-embedding-retrieving-generating?',
 'How does the Late Chunking approach differ from the naive chunking strategy in generating chunk embeddings?',
 'What is the correlation between the average length of documents and the effectiveness of late chunking in improving nDCG scores?',
 'What is the purpose of multimodal models in AI, and how do they differ from single-mode models?',
 "What is the 'modality gap' in multimodal models, and how does it affect the performance of CLIP-style models?",
 'What are the three major sources behind the modality gap, as identified by Liang et al. [2022]?',
 'What are the constraints of the zero-shot setting in t

In [None]:
for index in indexes:
  print(f"Asking {index.name}")
  for question in questions:
    print(f"- {question}")
    query_index(index, question)

Asking jina-segmenter-api
- What are the main improvements of Jina-ColBERT-v2 over the original ColBERT-v2 and jina-colbert-v1-en?
Asking jina-segmenter-api
- How does Jina-ColBERT-v2 handle multilingual data and what languages does it support?
Asking jina-segmenter-api
- What is Matryoshka Representation Learning and how does it benefit Jina ColBERT v2?
Asking jina-segmenter-api
- What are the challenges of the simple RAG pipeline of chunking-embedding-retrieving-generating?
Asking jina-segmenter-api
- How does the Late Chunking approach differ from the naive chunking strategy in generating chunk embeddings?
Asking jina-segmenter-api
- What is the correlation between the average length of documents and the effectiveness of late chunking in improving nDCG scores?
Asking jina-segmenter-api
- What is the purpose of multimodal models in AI, and how do they differ from single-mode models?
Asking jina-segmenter-api
- What is the 'modality gap' in multimodal models, and how does it affect th

In [None]:
pickle_object(os.path.join(pickle_dir, "indexes.pkl"), indexes, "qna")



In [None]:
# for doc in docs:
#   if doc.questions:
#     for q in doc.questions:
#       # print(q)
#       # print(q["question"])
#       # foo = get_answer("foo", jina_segmenter_api_index)
#       # foo = get_answer(q["question"], jina_segmenter_api_index)
#       q["answers"]["jina-segmenter-api"] = get_answer(q["question"], jina_segmenter_api_index)
#       # q["answer"]["chunking_strategy"] = "jina-segmenter-api"

In [None]:
# from pprint import pprint
# for question in docs[1].questions:
#   print(question["question"])
#   pprint(question["answers"])

#   print("---")

What are the main improvements of Jina-ColBERT-v2 over the original ColBERT-v2 and jina-colbert-v1-en?
{'jina-segmenter-api': 'Jina-ColBERT-v2 introduces several improvements over '
                       'the original ColBERT-v2 and jina-colbert-v1-en:\n'
                       '\n'
                       '1. Dynamic Vocabulary: Jina-ColBERT-v2 uses a dynamic '
                       'vocabulary, which allows it to handle '
                       'out-of-vocabulary words more effectively.\n'
                       '\n'
                       '2. Adaptive Hard Negative Sampling: This technique '
                       'helps the model to focus on harder negative samples '
                       'during training, leading to better performance.\n'
                       '\n'
                       '3. Support for Multilingual Search: Jina-ColBERT-v2 '
                       'supports multilingual search, enabling users to search '
                       'in multiple languages.\n'
       

## Evaluate answers

In [None]:
def evaluate_answers(docs):
  results = []
  for doc in docs:

    questions_and_answers_string = ""
    for i, question in enumerate(doc.questions):
      print(f"Question {i}: {question['question'].upper()}")
      questions_and_answers_string += f"Question {i}: {question['question'].upper()}"
      for key in question["answers"].keys():
        print(key)
        print(questions["answers"][key])

    results.append(questions_and_answers_string)

  return results

    # prompt = f"""
    # Your job is to evaluate three students who are answering questions based on a text. The text is as follows:

    # <begin text>
    # {doc.text}
    # <end text>

    # Here is each question and the answer from the students. Which student provided the most accurate and concise answer to the question?

    # <begin questions>
    # {doc.questions}
    # <end questions>
    # """
    # # Generate {count} technical question(s) about the given text that the text itself answers. Use this format:

    # #     [
    # #         "What are the key differences between dense and sparse retrieval methods in RAG systems?",
    # #         "How does a RAG model handle the integration of retrieved documents during the generation process?",
    # #         "What techniques can be used to optimize the retrieval phase in a RAG system for large-scale datasets?"
    # #     ]

    # # Present your output in only a structured JSON list of strings, with no other output or markdown formatting. Provide only the questions. Do not provide answers or context. Do not wrap your output in backticks. Text is as follows:

    # # {doc.text}
    # # """

    # response = mixtral_llm.complete(prompt)
    # # print(response)
    # # print(type(response))
    # print(response.json())

    # raw_output = response.text.strip()
    # # print(raw_output)

    # if raw_output[0] == '`':
    #   print("Code fencing detected. Fixing it")
    #   raw_output = raw_output.splitlines()[1:-1]
    #   raw_output = "\n".join(raw_output)

    # print(raw_output)

    # try:
    #   questions = json.loads(raw_output)
    # except:
    #   print("Failed to convert output to JSON")

    # [question.strip() for question in questions]

    # for question in questions:
    #   doc.questions.append({"question": question, "answers": {}})

    # # doc.questions = questions

In [None]:
output = evaluate_answers(docs)

Question 0: WHAT ARE THE MAIN IMPROVEMENTS OF JINA-COLBERT-V2 OVER THE ORIGINAL COLBERT-V2 AND JINA-COLBERT-V1-EN?
jina-segmenter-api


TypeError: 'NoneType' object is not subscriptable

In [None]:
doc0_answers = get_answers(docs[0], jina_segmenter_api_index, "jina-segmenter-api")

In [None]:
doc0_answers

[{'question: ': 'What are the key differences between the two-stage training process for Reader-LM models?',
  'answer': 'The two-stage training process for Reader-LM models involves first training a language model on a large corpus of text, followed by fine-tuning the model on a smaller dataset of HTML-to-Markdown pairs. The key difference is that in the first stage, the model is trained to predict the next token in a sequence, while in the second stage, the model is fine-tuned to classify tokens as either `1` or `0`, depending on whether they exist in both the input and output. This two-stage process allows the model to learn general language patterns in the first stage, and then specialize to the HTML-to-Markdown task in the second stage.',
  'strategy': 'jina-segmenter-api'},
 {'question: ': 'How does the data preparation process for Reader-LM models ensure high-quality training data?',
  'answer': 'The context information does not provide specific details on how the data preparati

In [None]:
# examine returned chunks
for question in docs[0].questions:
  retrieved_texts = retriever.retrieve(question)
  print(question.upper())
  for i, rt in enumerate(retrieved_texts):
    print(f"Text {i+1}:\n\n{rt.text}\n\n")

WHAT ARE THE KEY DIFFERENCES BETWEEN THE TWO-STAGE TRAINING PROCESS FOR READER-LM MODELS?
Text 1:

[](https://jina-ai-gmbh.ghost.io/content/images/2024/09/Qualitative-Evaluation-of-Reader-LM-vs-LLMs-and-Jina-Reader-API--1-.svg)


Text 2:

In the early stages of this project, we explored using an encoder-only architecture to tackle this task. As mentioned earlier, the HTML-to-Markdown conversion task appears to be primarily a "selective-copy" task. Given a training pair (raw HTML and markdown), we can label tokens that exist in both the input and output as `1`, and the rest as `0`. This converts the problem into a token classification task, similar to what is used in Named Entity Recognition (NER).


Text 3:

4.  **Markdown Syntax Usage**: Evaluated each model’s ability to correctly convert HTML elements such as `<a>` (links), `<strong>` (bold text), and `<em>` (italics) into their appropriate markdown equivalents.


HOW DOES THE DATA PREPARATION PROCESS FOR READER-LM MODELS ENSURE HIGH

In [None]:
# all_scores = []

# for idx in indexes:
#   scores = get_answer(idx, questions)
#   all_scores.append(scores)
#   # print(idx)

In [None]:
# get_answer(jina_segmenter_api_index, questions)

Text 1:

Try It Out                                                18
Cosine similarity:  0.5623142088441102


Text 2:

bases. Some of the subjects of particular interest to farmers include
Cosine similarity:  0.5026040589389831


---


## Why segmentation model so bad?

Let's check index

In [None]:
# os.makedirs("chunks", exist_ok=True)

# for doc in docs:
#   with open(f"./chunks/{doc.filename}", "w") as file:
#     for chunk_strategy in doc.chunks.keys():
#       text = ""
#       text += f"=== {chunk_strategy.upper()} - {len(docs[0].chunks[chunk_strategy])} chunks ===\n\n"
#       # text += "\n---\n"
#       for item in doc.chunks[chunk_strategy]:
#         text += item
#         text += "\n---\n"
#       file.write(text)

In [None]:
# with open("segmentation_model_chunks_farming.txt", "w") as file:
#   text = ""
#   for item in segmentation_model_index.docstore.docs.items():
#     text += item[1].text
#     text += "\n---\n"
#   file.write(text)

The Project Gutenberg eBook of Computers on the Farm This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.
Title: Computers on the Farm Author: Deborah Takiff Smith Release date: April 20, 2019 [eBook #59316] Language: English Credits: Produced by Tom Cosmas compiled from images provided by The Internet Archive *** START OF THE PROJECT GUTENBERG EBOOK COMPUTERS ON THE FARM *** Produced by Tom Cosmas compiled from images provided by The Internet Archive Transcribers Note Text emphasis denoted as _Italics_ and =Bold=.
Computers on the Farm Farm Uses for Computers, How to Select Software and Hardware, and O

In [None]:
def generate_evaluation_prompt(doc):
  evaluation_prompt = """
  You are an expert evaluator of RAG systems.

  Your job is to look at a given text and then look at several questions about the text.
  Each question has answers from four different RAG systems. You will evaluate these answers based on:
  - Accuracy (does it correctly answer the question based on original text)
  - Conciseness (does it get to the point? does it include any extraneous information)
  - Readability (how easy is it for an expert user to understand?)

  The RAG systems are called:
  - 'jina-segmenter-api'
  - 'segmentation-model'
  - 'langchain_semantic'
  - 'text-seg-lm'

  Give each RAG system a score out of ten for each answer it provides. At the end of your output, include a table with the final scores for each RAG system.

  Here is the text:
  <text begin>
  {}
  <text end>

  Here are the questions and answers:
  <questions and answers begin>
  {}
  <questions and answers end>
  """
  qna_text = ""

  for doc_question in doc.questions:
    # add question
    qna_text += f"Question: {doc_question['question']}\n"

    for index in indexes:
      # add each answer
      for idx_question in index.questions:
        if doc_question["question"] == idx_question["question"]:
          qna_text += f"{index.name}: {idx_question['answer']}\n"

    prompt = evaluation_prompt.format(doc.text, qna_text)

    return prompt

In [None]:
eval_prompts = []

for doc in docs:
  prompt = generate_evaluation_prompt(doc)
  eval_prompts.append(prompt)

In [None]:
with open("indexes-2024-09-19.pkl", "rb"