Ragas
- collects analytics
- can be costly https://openai.com/api/pricing/

Observations
- sometimes generates duplicate examples

Src
- https://github.com/explodinggradients/ragas
- https://docs.ragas.io/en/stable/getstarted/index.html

In [None]:
!pip install ragas langchain langchain_community unstructured python-magic-bin==0.4.14

In [7]:
from dotenv import load_dotenv
load_dotenv()

True

In [8]:
# Quickstart
from datasets import Dataset 
from ragas import evaluate
from ragas.metrics import faithfulness, answer_correctness

data_samples = {
    'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
    'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
    'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'], 
    ['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
    'ground_truth': ['The first superbowl was held on January 15, 1967', 'The New England Patriots have won the Super Bowl a record six times']
}

dataset = Dataset.from_dict(data_samples)

score = evaluate(dataset,metrics=[faithfulness,answer_correctness])
score.to_pandas()

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,question,contexts,answer,ground_truth,faithfulness,answer_correctness
0,When was the first super bowl?,[The First AFL–NFL World Championship Game was...,"The first superbowl was held on Jan 15, 1967","The first superbowl was held on January 15, 1967",1.0,0.999093
1,Who won the most super bowls?,"[The Green Bay Packers...Green Bay, Wisconsin....",The most super bowls have been won by The New ...,The New England Patriots have won the Super Bo...,0.0,0.981086


## Generate a testset

In [None]:
# For PDF parse
# Fix nltk punkt error: OSError: No such file or directory: '/Users/az/nltk_data/tokenizers/punkt/PY3_tab'
nltk.download('punkt_tab')
nltk.download('punkt')
!mv /Users/az/nltk_data/tokenizers/punkt/PY3/ /Users/az/nltk_data/tokenizers/punkt/PY3_tab/ # possibly wrong but works

In [13]:
path = "data/ignored/1_paper/"

In [15]:
from langchain_community.document_loaders import DirectoryLoader
loader = DirectoryLoader(path)
documents = loader.load()
for document in documents:
    document.metadata['filename'] = document.metadata['source']
len(documents)

1

In [17]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# generator with openai models
generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
critic_llm = ChatOpenAI(model="gpt-4o-2024-08-06")
# critic_llm = ChatOpenAI(model="gpt-4o-mini")
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# generate testset
testset = generator.generate_with_langchain_docs(documents, test_size=5, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})

embedding nodes:   0%|          | 0/20 [00:00<?, ?it/s]

Generating:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
testset_path = "data/ragas_attention_testset.csv"
testset.to_pandas().to_csv(testset_path)

## Evaluate a testset

In [29]:
from datasets import load_dataset

testset = load_dataset("csv", data_files=testset_path)
testset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'question', 'contexts', 'ground_truth', 'evolution_type', 'metadata', 'episode_done'],
        num_rows: 5
    })
})

In [79]:
# Fix reading list from csv bc ValidationError at evaluate
import json
def str_to_list(row: dict) -> dict:
    row["contexts"] = json.loads(row["contexts"].replace("'", "\"")) # replace bc json only allows "", not ''
    return row
testset = testset.map(str_to_list)

```
ValidationError: 1 validation error for SingleTurnSample
retrieved_contexts
  value is not a valid list (type=type_error.list)
```

Field `retrieved_contexts` was read as string:

```'contexts': "[' 28.4...']"```

In [80]:
# https://docs.ragas.io/en/stable/concepts/metrics/index.html#ragas-metrics
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

result = evaluate(
    testset["train"],
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
    ],
)

result

Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.


{'context_precision': 0.9000, 'faithfulness': nan, 'answer_relevancy': 0.1691, 'context_recall': 0.8500}

In [82]:
evaluation_path = "data/ragas_attention_evaluation.csv"
result.to_pandas().to_csv(evaluation_path)