# RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval

### RAPTOR implementation on the Qasper dataset

In [2]:
from raptor import RetrievalAugmentation 
from raptor import BaseSummarizationModel, BaseQAModel, BaseEmbeddingModel, RetrievalAugmentationConfig
from sentence_transformers import SentenceTransformer
from langchain_google_vertexai import ChatVertexAI
from vertexai.generative_models import HarmCategory,HarmBlockThreshold
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_core.messages import SystemMessage

  from .autonotebook import tqdm as notebook_tqdm
2025-01-15 23:25:58,912 - Loading faiss with AVX2 support.
2025-01-15 23:25:59,542 - Successfully loaded faiss with AVX2 support.


In [3]:
# You can define your own Summarization model by extending the base Summarization Class. 
class GEMINISummarizationModel(BaseSummarizationModel):
    """Summarization model that sends the text to a Gemini model via ChatVertexAI.

    The prompt is a fixed system message plus a user message that embeds the
    text to summarize. Sampling settings (temperature, top_p, top_k) are fixed;
    the output-token limit is chosen per call in ``summarize``.
    """

    # Output-token limit used for the chain built in ``__init__`` and as the
    # default of ``summarize``.
    DEFAULT_MAX_TOKENS = 150

    def __init__(self, model_name="gemini-1.5-pro", project="sovereign-cloud-420714"):
        """Build the prompt and the default summarization chain.

        Args:
            model_name: Name of the Vertex AI model passed to ChatVertexAI.
            project: Google Cloud project passed to ChatVertexAI. Defaults to
                the project that was previously hard-coded here.
        """
        self._model_name = model_name
        self._project = project
        # Blocking is switched off for every harm category listed here.
        self._safety_settings = {
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
        }

        sys_msg = SystemMessage(content = "You are a helpful assistant.")
        user_msg = HumanMessagePromptTemplate.from_template(
                template=[
                    {"type": "text", "text": "Write a summary of the following, including as many key details as possible: {context}:"},
                ],
            )
        self._chat_prompt = ChatPromptTemplate(messages=[sys_msg, user_msg])

        # One chain per requested output-token limit, created on first use.
        self._chains = {}
        self.summarization_chain = self._chain_for(self.DEFAULT_MAX_TOKENS)

    def _chain_for(self, max_tokens):
        """Return (building and caching if needed) the prompt|llm chain for ``max_tokens``."""
        chain = self._chains.get(max_tokens)
        if chain is None:
            # top_k = 50, original raptor | Not supported in ChatVertexAI (1-41)
            llm = ChatVertexAI(model=self._model_name, safety_settings=self._safety_settings,
                               temperature=0.7, top_p=0.95, top_k=40, max_tokens=max_tokens,
                               project=self._project)
            chain = self._chat_prompt | llm
            self._chains[max_tokens] = chain
        return chain

    def summarize(self, context, max_tokens=150):
        """Summarize ``context`` and return the stripped response text.

        Args:
            context: Text substituted into the ``{context}`` slot of the prompt.
            max_tokens: Output-token limit for this call. Previously this
                argument was accepted but ignored, so every call used the
                limit fixed in ``__init__``.

        Returns:
            The model response content with surrounding whitespace removed.
        """
        if max_tokens == self.DEFAULT_MAX_TOKENS:
            # Keep honouring the public attribute for the default limit.
            chain = self.summarization_chain
        else:
            chain = self._chain_for(max_tokens)
        output = chain.invoke({"context": context})
        return output.content.strip()


In [4]:
class GEMINIQAModel(BaseQAModel):
    """Question-answering model that queries a Gemini model via ChatVertexAI.

    The prompt is a fixed system message plus a user message that embeds the
    retrieved context and the question.
    """

    def __init__(self, model_name= "gemini-1.5-pro", max_tokens=256,
                 project="sovereign-cloud-420714"):
        """Build the prompt and the question-answering chain.

        Args:
            model_name: Name of the Vertex AI model passed to ChatVertexAI.
            max_tokens: Output-token limit for each answer. Defaults to the
                previously hard-coded 256; raise it if answers come back
                cut off mid-sentence.
            project: Google Cloud project passed to ChatVertexAI. Defaults to
                the project that was previously hard-coded here.
        """
        # Blocking is switched off for every harm category listed here.
        safety_settings = {
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
        }
        # top_k = 50, original raptor | Not supported in ChatVertexAI (1-41)
        llm = ChatVertexAI(model=model_name, safety_settings= safety_settings,
                           temperature=0.7, top_p=0.95, top_k=40, max_tokens=max_tokens,
                           project=project)

        sys_msg = SystemMessage(content = "You are Question Answering Portal.")
        user_msg = HumanMessagePromptTemplate.from_template(
                template=[
                    {"type": "text", "text": "Given Context: {context} Give the best full answer amongst the option to question {question}"},
                ]
            )
        chat_prompt = ChatPromptTemplate(messages=[sys_msg, user_msg])
        self.qa_chain = chat_prompt | llm

    def answer_question(self, context, question):
        """Answer ``question`` using ``context`` and return the stripped response text.

        Args:
            context: Text substituted into the ``{context}`` slot of the prompt.
            question: Text substituted into the ``{question}`` slot of the prompt.

        Returns:
            The model response content with surrounding whitespace removed.
        """
        output = self.qa_chain.invoke({"context":context, "question":question})
        answer = output.content.strip()
        return answer

In [5]:
class SBertEmbeddingModel(BaseEmbeddingModel):
    """Embedding model backed by a SentenceTransformer encoder."""

    def __init__(self, model_name="sentence-transformers/multi-qa-mpnet-base-cos-v1"):
        """Load the SentenceTransformer identified by ``model_name``."""
        encoder = SentenceTransformer(model_name)
        self.model = encoder

    def create_embedding(self, text):
        """Encode ``text`` with the loaded model and return the result of ``encode``."""
        embedding = self.model.encode(text)
        return embedding


In [6]:
# Shared configuration: Gemini for summarization and QA, SBERT for embeddings.
RAC = RetrievalAugmentationConfig(
    summarization_model=GEMINISummarizationModel(),
    qa_model=GEMINIQAModel(),
    embedding_model=SBertEmbeddingModel(),
)

2025-01-15 23:27:14,359 - Use pytorch device_name: cpu
2025-01-15 23:27:14,360 - Load pretrained SentenceTransformer: sentence-transformers/multi-qa-mpnet-base-cos-v1


In [None]:
# Demo run: index the sample text and ask a single question.

RA = RetrievalAugmentation(config=RAC)
# Explicit encoding so reading does not depend on the platform's locale default.
with open('demo/sample.txt', 'r', encoding='utf-8') as file:
    text = file.read()

RA.add_documents(text)
question = "How did Cinderella reach her happy ending?"
answer = RA.answer_question(question=question)
print("Answer: ", answer)

In [None]:
# Async experiments 

import asyncio
import json

from experiments import run

data_path = "data\\qasper-test-v0.3.0.json"
with open(data_path, "r") as f:
    test_data = json.load(f)
    samples = test_data[:25]
    print(len(samples))
results = await run(samples, RAC)

25
Length of samples 25


  0%|          | 0/25 [00:00<?, ?it/s]2025-01-16 00:07:09,742 - Successfully initialized TreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Max Tokens: 100
            Num Layers: 5
            Threshold: 0.5
            Top K: 5
            Selection Mode: top_k
            Summarization Length: 100
            Summarization Model: <__main__.GEMINISummarizationModel object at 0x0000023C98A4E090>
            Embedding Models: {'EMB': <__main__.SBertEmbeddingModel object at 0x0000023C99C87210>}
            Cluster Embedding Model: EMB
        
        Reduction Dimension: 10
        Clustering Algorithm: RAPTOR_Clustering
        Clustering Parameters: {}
        
2025-01-16 00:07:09,751 - Successfully initialized ClusterTreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Max Tokens: 100
            Num Layers: 5
            Threshold: 0.5
            Top K: 5
       

In [26]:
# Sync experiments: answer every question of the first `i` papers, one paper at a time.

import json
import os


def build_qasper_document(sample):
    """Flatten one dataset sample into a single text document.

    The layout is: title, an "abstract" section, one section per entry of
    ``full_text`` (section name followed by its paragraphs), and finally the
    figure/table captions under a "figures_and_tables" heading.

    Args:
        sample: Mapping with the keys ``title``, ``abstract``, ``full_text``
            (with ``section_name`` and ``paragraphs``) and
            ``figures_and_tables`` (with ``caption``).

    Returns:
        The assembled document as one string.
    """
    doc = sample["title"] + "\n\n" + "abstract" + "\n" + sample["abstract"] + "\n\n"
    section_names = sample["full_text"]["section_name"]
    section_paragraphs = sample["full_text"]["paragraphs"]
    for name, paras in zip(section_names, section_paragraphs):
        doc += name + "\n" + "\n".join(paras) + "\n\n"
    doc += "figures_and_tables\n" + "\n".join(sample["figures_and_tables"]["caption"])
    return doc


# Forward slashes work on Windows and POSIX alike (the old "data\\..." literal
# only resolved on Windows).
data_path = "data/qasper-test-v0.3.0.json"
folder_path = "results/outputs"
i = 2  # number of papers to process

# Load the dataset and close the file before the long-running loop starts.
with open(data_path, "r", encoding="utf-8") as f:
    test_data = json.load(f)
selected_data = test_data[:i]

# The per-paper result files are written here; make sure the folder exists.
os.makedirs(folder_path, exist_ok=True)

for sample in selected_data:
    doc = build_qasper_document(sample)

    # A fresh RetrievalAugmentation per paper, so each one only holds this document.
    RA = RetrievalAugmentation(config=RAC)
    RA.add_documents(doc)

    results = []
    question_ids = sample["qas"]["question_id"]
    for idx, question in enumerate(sample["qas"]["question"]):
        answer = RA.answer_question(question=question)
        results.append({"question_id": question_ids[idx], "question": question,
                        "predicted_evidence": "", "predicted_answer": answer})

    # One JSON Lines file per paper, named after the paper id.
    paper_id = sample["id"]
    res_path = os.path.join(folder_path, f"{paper_id}.jsonl")
    with open(res_path, "w", encoding="utf-8") as out_file:
        for res in results:
            json.dump(res, out_file)
            out_file.write("\n")

2025-01-15 21:47:59,597 - Successfully initialized TreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Max Tokens: 100
            Num Layers: 5
            Threshold: 0.5
            Top K: 5
            Selection Mode: top_k
            Summarization Length: 100
            Summarization Model: <__main__.GEMINISummarizationModel object at 0x000001C144FB56D0>
            Embedding Models: {'EMB': <__main__.SBertEmbeddingModel object at 0x000001C119E0A410>}
            Cluster Embedding Model: EMB
        
        Reduction Dimension: 10
        Clustering Algorithm: RAPTOR_Clustering
        Clustering Parameters: {}
        
2025-01-15 21:47:59,597 - Successfully initialized ClusterTreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Max Tokens: 100
            Num Layers: 5
            Threshold: 0.5
            Top K: 5
            Selection Mode: top_k
           

Question:  How big is the ANTISCAM dataset? 
Answer:  **The AntiScam dataset consists of 220 human-human text conversations.**





Batches: 100%|██████████| 1/1 [00:00<00:00, 12.42it/s]
2025-01-15 21:50:44,709 - Using collapsed_tree


Question:  How is intent annotated?
Answer:  The text describes a **hierarchical intent annotation scheme** for dialogue systems, particularly effective for non-collaborative tasks like scam detection. Here's how it works:

1. **Intent Separation:** Intents are divided into two main categories:
   - **On-task Intents:** These represent key actions specific to the task. For instance, in the AntiScam dataset, on-task intents include *elicitation*, *providing_information*, and *refusal*.
   - **Off-task Intents:** These capture general conversational elements not tied to the specific task. Instead of creating task-specific categories, common dialogue acts are used. Examples include *open_question*, *yes_no_question*, *positive_answer*, *greeting*, *closing*, etc.

2. **Annotation Process:**
   - **On-task intents** are annotated directly based on the defined task-specific categories.
   - **Off-task intents** are labeled using the general dialogue act categories.

3. **Example:** In the A

Batches: 100%|██████████| 1/1 [00:00<00:00,  8.64it/s]
2025-01-15 21:50:46,996 - Using collapsed_tree


Question:  What are the baselines outperformed by this work?
Answer:  The baselines outperformed by MISSA are:

* **TransferTransfo:**  A vanilla Transformer-based model. MISSA improves upon this by adding intent and slot classifiers, highlighting their importance.
* **Hybrid Model:** This model combines the vanilla TransferTransfo with elements of MISSA.  However, MISSA still outperforms it, suggesting the effectiveness of its overall architecture and design.





Batches: 100%|██████████| 1/1 [00:00<00:00,  2.25it/s]


Question:  What are the evaluation metrics and criteria used to evaluate the model performance?
Answer:  The evaluation of the dialogue system employs two main approaches: **Automatic Evaluation** and **Human Evaluation**, each encompassing several metrics:

**Automatic Evaluation:**

* **Perplexity:** This metric measures the language model's ability to predict the next word in a sequence, essentially assessing its fluency and understanding of language. Lower perplexity scores indicate better performance.

* **Response-Intent Prediction (RIP) & Response-Slot Prediction (RSP):**  These metrics evaluate the alignment between the system's responses and the user's intents. RIP focuses on recognizing the intention behind the user's input, while RSP aims to extract specific information (slots) relevant to the identified intent. 

* **Extended RIP & RSP (ERIP & ERSP):** These extend the basic RIP and RSP by acknowledging that multiple system intents might be valid responses to a single user 

In [7]:
# Combine the per-paper prediction files into one JSON Lines file for the evaluation script.

import os

results_folder = "./results/outputs"
combine_file = "./results/combine.jsonl"

# Stop before touching combine_file: previously the message was printed and the
# cell went on to truncate combine_file and then fail inside os.listdir.
if not os.path.isdir(results_folder):
    print(f"Results folder does not exist: {results_folder}")
    raise FileNotFoundError(results_folder)

with open(combine_file, 'w', encoding='utf-8') as outfile:
    # Sorted so the combined file is identical from run to run.
    for file_name in sorted(os.listdir(results_folder)):
        if not file_name.endswith(".jsonl"):
            continue
        file_path = os.path.join(results_folder, file_name)

        with open(file_path, 'r', encoding='utf-8') as infile:
            for line in infile:
                outfile.write(line)
                # Keep records separate if a file lacks a final newline.
                if not line.endswith("\n"):
                    outfile.write("\n")
                


In [10]:
# Run the evaluation script on the combined predictions file against the gold test file.
%run evaluation-qasper.py --predictions {combine_file} --gold ./data/qasper-test-v0.3.0.json

{
  "Answer F1": 0.17125569173872984,
  "Answer F1 by type": {
    "extractive": 0.24224082245395526,
    "abstractive": 0.14807017674597192,
    "boolean": 0.015831517792302106,
    "none": 0.0
  },
  "Evidence F1": 0.27058823529411763
}
