In [1]:
# NOTE: An OpenAI API key must be present for application initialization, even if
# OpenAI models are not in use. Provide a real key through the OPENAI_API_KEY
# environment variable before starting the kernel; it is deliberately not
# hard-coded here so it cannot leak when the notebook is shared.
# If the variable is missing or empty, fall back to the placeholder "not_used"
# rather than overwriting an existing key with an empty string.
import os

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = "not_used"

In [2]:
import time
import random
import numpy as np
from datasets import load_dataset
from nltk.tokenize import sent_tokenize
from rouge_score import rouge_scorer
import sacrebleu
import json

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
from raptor import RetrievalAugmentation 

2024-06-12 17:16:25,152 - Loading faiss.
2024-06-12 17:16:25,177 - Successfully loaded faiss.


In [5]:
# Load the first 1000 rows of the NarrativeQA test split.
dataset = load_dataset("deepmind/narrativeqa", split="test[:1000]")

In [6]:
# Sort the rows by document id so that questions about the same document are adjacent.
# `sorted` returns a plain Python list, so `dataset` is a list of rows from here on.
dataset = sorted(dataset, key=lambda x: x['document']['id'])

In [23]:
def split_text_preserve_sentences(text, ratio=0.8):
    """Split ``text`` into two parts at a sentence boundary.

    The text is tokenized into sentences with ``sent_tokenize``; the first
    ``int(len(sentences) * ratio)`` sentences form the first part and the
    remaining sentences form the second part. Sentences are re-joined with a
    single space.

    Note: the cut index is floored, so a very short text (for example a single
    sentence with the default ratio) yields an empty first part.

    Args:
        text: The text to split.
        ratio: Fraction of the sentences to place in the first part.

    Returns:
        A ``(first_part, second_part)`` tuple of strings.
    """
    sentences = sent_tokenize(text)
    cut = int(len(sentences) * ratio)
    head, tail = sentences[:cut], sentences[cut:]
    return ' '.join(head), ' '.join(tail)

# Function to calculate ROUGE scores
def calculate_rouge_scores(predictions, references):
    """Return the mean ROUGE-1, ROUGE-2 and ROUGE-L F-measures over aligned pairs.

    Predictions and references are paired positionally with ``zip``, so any
    surplus items in the longer sequence are ignored.

    Args:
        predictions: Sequence of predicted answer strings.
        references: Sequence of reference answer strings.

    Returns:
        A dict mapping 'rouge1', 'rouge2' and 'rougeL' to the ``np.mean`` of the
        per-pair F-measures.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)

    # Score every (prediction, reference) pair once, then aggregate per metric.
    per_pair = [scorer.score(ref, pred) for pred, ref in zip(predictions, references)]
    return {
        name: np.mean([pair[name].fmeasure for pair in per_pair])
        for name in metric_names
    }

# Function to calculate BLEU scores
def calculate_bleu_scores(predictions, references):
    """Return the mean BLEU-1 and BLEU-4 values over aligned prediction/reference pairs.

    Predictions and references are paired positionally with ``zip``, so any
    surplus items in the longer sequence are ignored. Each pair is scored with
    a single ``sacrebleu.corpus_bleu`` call; both reported values are read from
    that one result instead of scoring the same pair twice.

    Args:
        predictions: Sequence of predicted answer strings.
        references: Sequence of reference answer strings.

    Returns:
        A dict with keys "bleu1" and "bleu4" holding the ``np.mean`` of the
        per-pair values.
    """
    bleu1_scores = []
    bleu4_scores = []

    for pred, ref in zip(predictions, references):
        bleu = sacrebleu.corpus_bleu(
            [pred],
            [[ref]],
            smooth_method='exp',
            smooth_value=0.0,
            use_effective_order=False,
        )
        # NOTE(review): precisions[0] is the unigram precision of the result and
        # presumably excludes the brevity penalty - confirm this is the intended BLEU-1.
        bleu1_scores.append(bleu.precisions[0])
        # The overall score of the same result is reported as BLEU-4.
        bleu4_scores.append(bleu.score)

    # Return the average BLEU-1 and BLEU-4 scores
    return {
        "bleu1": np.mean(bleu1_scores),
        "bleu4": np.mean(bleu4_scores)
    }

In [None]:
# Compare a fully built tree against an incrementally extended tree.
#
# For every distinct document two RetrievalAugmentation instances are built
# once and cached:
#   * a "full" tree built from the complete summary text, and
#   * a "partial" tree built from the first 4/5 of the sentences, to which the
#     remaining 1/5 is then passed via `add_to_existing`.
# Every question is answered with both trees, and the answers are scored with
# ROUGE and BLEU so that quality and build time can be compared.


def _mean_by_key(score_dicts):
    """Average a list of {metric: value} dicts metric by metric ({} if the list is empty)."""
    if not score_dicts:
        return {}
    return {key: np.mean([scores[key] for scores in score_dicts]) for key in score_dicts[0]}


# Per-document timing records, keyed by document id
times = {}

# Per-question scores, accumulated for averaging after the loop
all_full_rouge_scores = []
all_full_bleu1_scores = []
all_full_bleu4_scores = []
all_partial_rouge_scores = []
all_partial_bleu1_scores = []
all_partial_bleu4_scores = []

# Per-document timing comparisons (full build vs. adding the last 1/5)
time_diff = []
magnitude_diff = []

# Caches so that each document's trees are built only once
full_trees = {}
partial_trees = {}

# Main processing
for i, row in enumerate(dataset):
    print("Processing ROW : ", i)
    doc_id = row["document"]["id"]
    question = row["question"]["text"]
    answers = [ans["text"] for ans in row["answers"]]

    # Build both trees the first time a document is seen
    if doc_id not in full_trees:
        text = row["document"]["summary"]["text"]
        # Split into 4/5 and 1/5 while preserving full sentences. This is only
        # needed when the trees are built, so it is not repeated for every question.
        text_4_5, text_1_5 = split_text_preserve_sentences(text, ratio=0.8)

        # Full text tree (timed); perf_counter is monotonic and meant for measuring durations
        start_time = time.perf_counter()
        RA = RetrievalAugmentation()
        RA.add_documents(text)
        full_build_time = time.perf_counter() - start_time
        full_trees[doc_id] = RA

        # Tree for the first 4/5 of the text (not timed)
        PartialTree = RetrievalAugmentation()
        PartialTree.add_documents(text_4_5)

        # Adding the remaining 1/5 (timed)
        start_time = time.perf_counter()
        PartialTree.add_to_existing(text_1_5)
        add_1_5_time = time.perf_counter() - start_time
        partial_trees[doc_id] = PartialTree

        # Record times; guard the ratio so a zero-length measurement cannot
        # abort the whole run with ZeroDivisionError.
        difference = full_build_time - add_1_5_time
        multiplier = full_build_time / add_1_5_time if add_1_5_time > 0 else float("inf")
        times[doc_id] = {
            "full_build_time": full_build_time,
            "add_1_5_time": add_1_5_time,
            "difference": difference,
            "multiplier": multiplier
        }
        time_diff.append(difference)
        magnitude_diff.append(multiplier)

    # NOTE(review): each prediction list holds a single answer, and the scoring
    # helpers pair predictions with references via zip, so only answers[0] is
    # scored and any further reference answers are ignored - confirm this is intended.

    # Evaluate QA on the full text tree
    RA = full_trees[doc_id]
    full_predictions = [RA.answer_question(question=question)]
    full_rouge_scores = calculate_rouge_scores(full_predictions, answers)
    full_bleu_score = calculate_bleu_scores(full_predictions, answers)
    all_full_rouge_scores.append(full_rouge_scores)
    all_full_bleu1_scores.append(full_bleu_score["bleu1"])
    all_full_bleu4_scores.append(full_bleu_score["bleu4"])

    # Evaluate QA on the partial (incrementally extended) tree
    PartialTree = partial_trees[doc_id]
    partial_predictions = [PartialTree.answer_question(question=question)]
    partial_rouge_scores = calculate_rouge_scores(partial_predictions, answers)
    partial_bleu_score = calculate_bleu_scores(partial_predictions, answers)
    all_partial_rouge_scores.append(partial_rouge_scores)
    all_partial_bleu1_scores.append(partial_bleu_score["bleu1"])
    all_partial_bleu4_scores.append(partial_bleu_score["bleu4"])

# Calculate average scores
average_full_rouge_scores = _mean_by_key(all_full_rouge_scores)
average_full_bleu1_score = np.mean(all_full_bleu1_scores)
average_full_bleu4_score = np.mean(all_full_bleu4_scores)
average_partial_rouge_scores = _mean_by_key(all_partial_rouge_scores)
average_partial_bleu1_score = np.mean(all_partial_bleu1_scores)
average_partial_bleu4_score = np.mean(all_partial_bleu4_scores)


# Output results
print("Average Full Tree ROUGE Scores:", average_full_rouge_scores)
print("Average Full Tree BLEU1 Score:", average_full_bleu1_score)
print("Average Full Tree BLEU4 Score:", average_full_bleu4_score)
print("Average Partial Tree ROUGE Scores:", average_partial_rouge_scores)
print("Average Partial Tree BLEU1 Score:", average_partial_bleu1_score)
print("Average Partial Tree BLEU4 Score:", average_partial_bleu4_score)
print("Average Time Difference:", np.mean(time_diff))
print("Average Magnitude Difference:", np.mean(magnitude_diff))

In [None]:
# Quick output check: re-print the aggregate metrics computed by the evaluation
# cell without re-running it.
summary_rows = [
    ("Average Full Tree ROUGE Scores:", average_full_rouge_scores),
    ("Average Full Tree BLEU1 Score:", average_full_bleu1_score),
    ("Average Full Tree BLEU4 Score:", average_full_bleu4_score),
    ("Average Partial Tree ROUGE Scores:", average_partial_rouge_scores),
    ("Average Partial Tree BLEU1 Score:", average_partial_bleu1_score),
    ("Average Partial Tree BLEU4 Score:", average_partial_bleu4_score),
    ("Average Time Difference:", np.mean(time_diff)),
    ("Average Magnitude Difference:", np.mean(magnitude_diff)),
]
for label, value in summary_rows:
    print(label, value)