# For the evaluation, we employ Ragas which is a SOTA metrics for evaluating Retrieval Augmented Generation (RAG) pipelines. 

In [1]:
from datasets import Dataset 
import os
from ragas import evaluate
from ragas.metrics import faithfulness, answer_correctness

In [None]:
#os.environ["OPENAI_API_KEY"] = "your-openai-key"

In [None]:
data_samples = {
    'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
    'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
    'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'], 
    ['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
    'ground_truth': ['The first superbowl was held on January 15, 1967', 'The New England Patriots have won the Super Bowl a record six times']
}

dataset = Dataset.from_dict(data_samples)

In [3]:
score = evaluate(dataset,metrics=[faithfulness,answer_correctness])
score.to_pandas()

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_correctness
0,When was the first super bowl?,"The first superbowl was held on Jan 15, 1967",[The First AFL–NFL World Championship Game was...,"The first superbowl was held on January 15, 1967",1.0,0.749096
1,Who won the most super bowls?,The most super bowls have been won by The New ...,"[The Green Bay Packers...Green Bay, Wisconsin....",The New England Patriots have won the Super Bo...,0.0,0.731086


In [3]:
from datasets import load_dataset

# loading the V2 dataset
amnesty_qa = load_dataset("explodinggradients/amnesty_qa", "english_v2")
amnesty_qa

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/5.72k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/70.8k [00:00<?, ?B/s]

Generating eval split: 0 examples [00:00, ? examples/s]

DatasetDict({
    eval: Dataset({
        features: ['question', 'ground_truth', 'answer', 'contexts'],
        num_rows: 20
    })
})

In [4]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

In [5]:
from ragas import evaluate

result = evaluate(
    amnesty_qa["eval"],
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
    ],
)

result

Evaluating:   0%|          | 0/80 [00:00<?, ?it/s]

{'context_precision': 0.9458, 'faithfulness': 0.4497, 'answer_relevancy': 0.9790, 'context_recall': 0.8204}

In [6]:
df = result.to_pandas()
df.head()

Unnamed: 0,question,ground_truth,answer,contexts,context_precision,faithfulness,answer_relevancy,context_recall
0,What are the global implications of the USA Su...,The global implications of the USA Supreme Cou...,The global implications of the USA Supreme Cou...,"[- In 2022, the USA Supreme Court handed down ...",1.0,0.357143,0.987989,1.0
1,Which companies are the main contributors to G...,"According to the Carbon Majors database, the m...","According to the Carbon Majors database, the m...","[In recent years, there has been increasing pr...",1.0,0.111111,0.970326,1.0
2,Which private companies in the Americas are th...,The largest private companies in the Americas ...,"According to the Carbon Majors database, the l...",[The issue of greenhouse gas emissions has bec...,0.833333,0.0,1.0,1.0
3,What action did Amnesty International urge its...,Amnesty International urged its supporters to ...,Amnesty International urged its supporters to ...,"[In the case of the Ogoni 9, Amnesty Internati...",1.0,0.5,0.985524,1.0
4,What are the recommendations made by Amnesty I...,The recommendations made by Amnesty Internatio...,Amnesty International made several recommendat...,"[In recent years, Amnesty International has fo...",1.0,0.047619,0.989104,1.0


In [7]:
df.shape

(20, 8)

In [8]:
from datasets import Dataset 
from ragas.metrics import faithfulness
from ragas import evaluate

data_samples = {
    'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
    'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
    'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'], 
    ['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
}
dataset = Dataset.from_dict(data_samples)
score = evaluate(dataset,metrics=[faithfulness])
score.to_pandas()

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,question,answer,contexts,faithfulness
0,When was the first super bowl?,"The first superbowl was held on Jan 15, 1967",[The First AFL–NFL World Championship Game was...,0.0
1,Who won the most super bowls?,The most super bowls have been won by The New ...,"[The Green Bay Packers...Green Bay, Wisconsin....",0.0


In [9]:
from datasets import Dataset 
from ragas.metrics import answer_relevancy
from ragas import evaluate

data_samples = {
    'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
    'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
    'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'], 
    ['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
}
dataset = Dataset.from_dict(data_samples)
score = evaluate(dataset,metrics=[answer_relevancy])
score.to_pandas()

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,question,answer,contexts,answer_relevancy
0,When was the first super bowl?,"The first superbowl was held on Jan 15, 1967",[The First AFL–NFL World Championship Game was...,0.97532
1,Who won the most super bowls?,The most super bowls have been won by The New ...,"[The Green Bay Packers...Green Bay, Wisconsin....",0.943043


In [10]:
from datasets import Dataset 
from ragas.metrics import context_precision
from ragas import evaluate

data_samples = {
    'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
    'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
    'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'], 
    ['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
    'ground_truth': ['The first superbowl was held on January 15, 1967', 'The New England Patriots have won the Super Bowl a record six times']
}
dataset = Dataset.from_dict(data_samples)
score = evaluate(dataset,metrics=[context_precision])
score.to_pandas()

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,question,answer,contexts,ground_truth,context_precision
0,When was the first super bowl?,"The first superbowl was held on Jan 15, 1967",[The First AFL–NFL World Championship Game was...,"The first superbowl was held on January 15, 1967",1.0
1,Who won the most super bowls?,The most super bowls have been won by The New ...,"[The Green Bay Packers...Green Bay, Wisconsin....",The New England Patriots have won the Super Bo...,0.0


In [12]:
from ragas.metrics import ContextRelevancy
context_relevancy = ContextRelevancy()


Dataset({
    features: ['question','contexts'],
    num_rows: 25
})
dataset: Dataset

results = context_relevancy.score(dataset)

NameError: name 'features' is not defined

In [13]:
from datasets import Dataset 
from ragas.metrics import context_recall
from ragas import evaluate

data_samples = {
    'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
    'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
    'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'], 
    ['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
    'ground_truth': ['The first superbowl was held on January 15, 1967', 'The New England Patriots have won the Super Bowl a record six times']
}
dataset = Dataset.from_dict(data_samples)
score = evaluate(dataset,metrics=[context_recall])
score.to_pandas()

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,question,answer,contexts,ground_truth,context_recall
0,When was the first super bowl?,"The first superbowl was held on Jan 15, 1967",[The First AFL–NFL World Championship Game was...,"The first superbowl was held on January 15, 1967",1.0
1,Who won the most super bowls?,The most super bowls have been won by The New ...,"[The Green Bay Packers...Green Bay, Wisconsin....",The New England Patriots have won the Super Bo...,0.0


In [14]:
from datasets import Dataset 
from ragas.metrics import context_entity_recall
from ragas import evaluate

data_samples = {
    'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'], 
    ['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
    'ground_truth': ['The first superbowl was held on January 15, 1967', 'The New England Patriots have won the Super Bowl a record six times']
}
dataset = Dataset.from_dict(data_samples)
score = evaluate(dataset,metrics=[context_entity_recall])
score.to_pandas()

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,contexts,ground_truth,context_entity_recall
0,[The First AFL–NFL World Championship Game was...,"The first superbowl was held on January 15, 1967",0.5
1,"[The Green Bay Packers...Green Bay, Wisconsin....",The New England Patriots have won the Super Bo...,0.0


In [15]:
from datasets import Dataset 
from ragas.metrics import answer_similarity
from ragas import evaluate


data_samples = {
    'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
    'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
    'ground_truth': ['The first superbowl was held on January 15, 1967', 'The New England Patriots have won the Super Bowl a record six times']
}
dataset = Dataset.from_dict(data_samples)
score = evaluate(dataset,metrics=[answer_similarity])
score.to_pandas()

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,question,answer,ground_truth,answer_similarity
0,When was the first super bowl?,"The first superbowl was held on Jan 15, 1967","The first superbowl was held on January 15, 1967",0.996372
1,Who won the most super bowls?,The most super bowls have been won by The New ...,The New England Patriots have won the Super Bo...,0.924343


In [16]:
from datasets import Dataset 
from ragas.metrics import faithfulness, answer_correctness
from ragas import evaluate

data_samples = {
    'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
    'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
    'ground_truth': ['The first superbowl was held on January 15, 1967', 'The New England Patriots have won the Super Bowl a record six times']
}
dataset = Dataset.from_dict(data_samples)
score = evaluate(dataset,metrics=[answer_correctness])
score.to_pandas()

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,question,answer,ground_truth,answer_correctness
0,When was the first super bowl?,"The first superbowl was held on Jan 15, 1967","The first superbowl was held on January 15, 1967",0.749093
1,Who won the most super bowls?,The most super bowls have been won by The New ...,The New England Patriots have won the Super Bo...,0.981086


In [17]:
from datasets import Dataset 
from ragas.metrics.critique import harmfulness
from ragas import evaluate

data_samples = {
    'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
    'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
    'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'], 
    ['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
}
dataset = Dataset.from_dict(data_samples)
score = evaluate(dataset,metrics=[harmfulness])
score.to_pandas()

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,question,answer,contexts,harmfulness
0,When was the first super bowl?,"The first superbowl was held on Jan 15, 1967",[The First AFL–NFL World Championship Game was...,0
1,Who won the most super bowls?,The most super bowls have been won by The New ...,"[The Green Bay Packers...Green Bay, Wisconsin....",0


In [18]:
from ragas.llms.prompt import Prompt

qa_prompt = Prompt(
    name="question_generation",
    instruction="Generate a question for the given answer",
    examples=[
        {
            "answer": "The last Olympics was held in Tokyo, Japan.",
            "context": "The last Olympics was held in Tokyo, Japan. It is held every 4 years",
            "output": {"question":"Where was the last Olympics held?"},
        },
        {
            "answer": "It can change its skin color based on the temperature of its environment.",
            "context": "A recent scientific study has discovered a new species of frog in the Amazon rainforest that has the unique ability to change its skin color based on the temperature of its environment.",
            "output": {"question":"What unique ability does the newly discovered species of frog have?"},
        }
    ],
    input_keys=["answer", "context"],
    output_key="output",
    output_type="json",
)

In [19]:
print(qa_prompt.to_string())

Generate a question for the given answer

Examples:

answer: "The last Olympics was held in Tokyo, Japan."
context: "The last Olympics was held in Tokyo, Japan. It is held every 4 years"
output: ```{{"question": "Where was the last Olympics held?"}}```

answer: "It can change its skin color based on the temperature of its environment."
context: "A recent scientific study has discovered a new species of frog in the Amazon rainforest that has the unique ability to change its skin color based on the temperature of its environment."
output: ```{{"question": "What unique ability does the newly discovered species of frog have?"}}```

Your actual task:

answer: {answer}
context: {context}
output: 



In [20]:
qa_prompt.format(answer="This is an answer", context="This is a context")

PromptValue(prompt_str='Generate a question for the given answer\n\nExamples:\n\nanswer: "The last Olympics was held in Tokyo, Japan."\ncontext: "The last Olympics was held in Tokyo, Japan. It is held every 4 years"\noutput: ```{"question": "Where was the last Olympics held?"}```\n\nanswer: "It can change its skin color based on the temperature of its environment."\ncontext: "A recent scientific study has discovered a new species of frog in the Amazon rainforest that has the unique ability to change its skin color based on the temperature of its environment."\noutput: ```{"question": "What unique ability does the newly discovered species of frog have?"}```\n\nYour actual task:\n\nanswer: This is an answer\ncontext: This is a context\noutput: \n')

In [22]:
qa_prompt.save()

In [23]:
from ragas.utils import RAGAS_CACHE_HOME
Prompt._load(name="question_generation",language="english",cache_dir=RAGAS_CACHE_HOME)

ImportError: cannot import name 'RAGAS_CACHE_HOME' from 'ragas.utils' (/Users/jianhaozhang/miniconda3/envs/591/lib/python3.11/site-packages/ragas/utils.py)

In [24]:
from ragas.llms.prompt import Prompt
from langchain_openai.chat_models import ChatOpenAI
from ragas.llms.base import LangchainLLMWrapper

openai_model = ChatOpenAI(model="gpt-3.5-turbo")

openai_model = LangchainLLMWrapper(openai_model)
noun_extractor = Prompt(
    name="noun_extractor",
    instruction="Extract the noun from given sentence",
    examples=[{
        "sentence":"The sun sets over the mountains.",
        "output":{"nouns":["sun", "mountains"]}
    }],
    input_keys=["sentence"],
    output_key="output",
    output_type="json"
)

In [25]:
adapted_prompt = noun_extractor.adapt(language="chinese",llm=openai_model)
print(adapted_prompt.to_string())

Extract the noun from given sentence

Examples:

sentence: "太阳在山上落山。"
output: ```{{"nouns": ["太阳", "山"]}}```

Your actual task:

sentence: {sentence}
output: 



In [26]:
from langchain_community.document_loaders import PubMedLoader

loader = PubMedLoader("liver", load_max_docs=10)
documents = loader.load()

ImportError: Could not import xmltodict python package. Please install it with `pip install xmltodict`.

In [27]:
from llama_index import download_loader

SemanticScholarReader = download_loader("SemanticScholarReader")
loader = SemanticScholarReader()
query_space = "large language models"
documents = loader.load_data(query=query_space,full_text=True,limit=10)

ModuleNotFoundError: No module named 'llama_index'

In [28]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# documents = load your documents

# generator with openai models
generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
critic_llm = ChatOpenAI(model="gpt-4")
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# Change resulting question type distribution
distributions = {
    simple: 0.5,
    multi_context: 0.4,
    reasoning: 0.1
}

# use generator.generate_with_llamaindex_docs if you use llama-index as document loader
testset = generator.generate_with_langchain_docs(documents, 10, distributions) 
testset.to_pandas()

  params = OrderedDict((param.name, param) for param in parameters)


NameError: name 'documents' is not defined

In [29]:
test_df = testset.to_pandas()
test_df.head()

NameError: name 'testset' is not defined