In [13]:
import pandas as pd
from rouge_score import rouge_scorer

In [14]:
# Load the saved evaluation results, one CSV per pipeline. The first CSV
# column is used as the row index of each frame.
naive_rag, no_rag, prop_chunk_rag, small2big_rag = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'results/naive_rag.csv',
        'results/no_rag.csv',
        'results/prop_chunk_rag.csv',
        'results/small2big_rag.csv',
    )
)


In [15]:
# Shared ROUGE-L scorer; built lazily on the first call so that defining this
# cell stays cheap and every row reuses the same instance.
_ROUGE_L_SCORER = None


def _rouge_text(value):
    """Return ``value`` as text for scoring; missing values become ''."""
    if isinstance(value, str):
        return value
    # An empty CSV cell is read back by pandas as NaN (a float); passing that
    # to the scorer would fail when it tries to lowercase the text.
    return "" if pd.isna(value) else str(value)


def compute_rouge(row):
    """Return the ROUGE-L F-measure of one generated answer.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            ``'reference_answers'`` (the target text) and
            ``'generated_answers'`` (the prediction).

    Returns:
        The ``fmeasure`` of the ``rougeL`` score computed with stemming
        enabled. Missing (NaN/None) answers are scored as empty text, and
        other non-string values are converted with ``str()``.
    """
    global _ROUGE_L_SCORER
    if _ROUGE_L_SCORER is None:
        _ROUGE_L_SCORER = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    reference = _rouge_text(row['reference_answers'])
    prediction = _rouge_text(row['generated_answers'])

    # RougeScorer.score takes the target first, then the prediction.
    rouge_scores = _ROUGE_L_SCORER.score(reference, prediction)
    return rouge_scores["rougeL"].fmeasure

In [16]:
def get_results(df):
    """Score ``df`` with ROUGE-L and print the mean of each evaluation metric.

    Adds (or overwrites) a ``"ROUGE-L"`` column on ``df`` in place by applying
    ``compute_rouge`` to every row, then prints one line per metric.

    Args:
        df: DataFrame holding the columns ``compute_rouge`` reads plus
            ``correctness``, ``groundedness``, ``relevance``,
            ``retrieval_relevance`` and ``documents``.

    Returns:
        None. The averages are printed and ``df`` is mutated.
    """
    df["ROUGE-L"] = df.apply(compute_rouge, axis=1, result_type="expand")

    # The outer quotes are double quotes on purpose: reusing the same quote
    # character inside a replacement field is a SyntaxError before Python 3.12.
    # Only the ROUGE mean is rounded; the other means are printed as computed.
    print(f"Average ROUGE: {round(df['ROUGE-L'].mean(), 2)}")
    print(f"Average Correctness: {df['correctness'].mean()}")
    print(f"Average Groundedness: {df['groundedness'].mean()}")
    print(f"Average Relevance: {df['relevance'].mean()}")
    print(f"Average Retrieval Relevance: {df['retrieval_relevance'].mean()}")
    # len() is applied per cell; presumably this is a character count because
    # the column is read back from CSV as text -- TODO confirm.
    print(f"Average Documents Length {df['documents'].apply(len).mean()}")

- Correctness (Response vs reference answer)
- Relevance (Response vs input)
- Groundedness (Response vs retrieved docs)
- Retrieval relevance (Retrieved docs vs input)

In [17]:
# Report the metric averages for the run loaded from results/no_rag.csv.
# The timing line is a hard-coded figure; nothing is measured in this cell.
# NOTE(review): the recorded output shows a non-zero retrieval relevance and
# document length for this run -- confirm how `documents` was populated here.
print('No RAG')
get_results(no_rag)
print('Time taken to answer 100 questions: 9 minutes')

No RAG
Average ROUGE: 0.29
Average Correctness: 1.56
Average Groundedness: 1.08
Average Relevance: 3.62
Average Retrieval Relevance: 2.83
Average Documents Length 8881.15
Time taken to answer 100 questions: 9 minutes


In [18]:
# Report the metric averages for the run loaded from results/naive_rag.csv.
# The timing line is a hard-coded figure; nothing is measured in this cell.
print('Naive RAG')
get_results(naive_rag)
print('Time taken to answer 100 questions: 14 minutes')

Naive RAG
Average ROUGE: 0.36
Average Correctness: 2.14
Average Groundedness: 1.23
Average Relevance: 4.08
Average Retrieval Relevance: 2.98
Average Documents Length 8848.48
Time taken to answer 100 questions: 14 minutes


In [19]:
# Report the metric averages for the run loaded from results/small2big_rag.csv.
# The timing line is a hard-coded figure; nothing is measured in this cell.
print('Small2Big RAG')
get_results(small2big_rag)
print('Time taken to answer 100 questions: 21 minutes')

Small2Big RAG
Average ROUGE: 0.31
Average Correctness: 2.08
Average Groundedness: 1.12
Average Relevance: 3.65
Average Retrieval Relevance: 2.64
Average Documents Length 13398.68
Time taken to answer 100 questions: 21 minutes


In [20]:
# Report the metric averages for the run loaded from results/prop_chunk_rag.csv.
# The timing line is a hard-coded figure; nothing is measured in this cell.
print('Proposition RAG')
get_results(prop_chunk_rag)
print('Time taken to answer 100 questions: 367 minutes')

Proposition RAG
Average ROUGE: 0.34
Average Correctness: 1.88
Average Groundedness: 1.81
Average Relevance: 3.9
Average Retrieval Relevance: 2.99
Average Documents Length 346.45
Time taken to answer 100 questions: 367 minutes
