### Libraries Required

In [None]:
# pip install llama_index html2text trulens_eval

### Adding API keys
An OpenAI API key is required, both for the GPT model used by the application and for the evaluations.

In [None]:
import os

# Placeholder only: substitute your own key before running, and do not commit a real key.
os.environ["OPENAI_API_KEY"] = "[REPLACE_WITH_YOUR_OPENAI_API_KEY]"

### Import from LlamaIndex and TruLens

In [None]:
# TruLens pieces used throughout this notebook: feedback definitions, the `Tru`
# handle, and the LlamaIndex recorder wrapper.
from trulens_eval import Feedback, Tru, TruLlama
from trulens_eval.feedback import Groundedness
# Feedback provider class from TruLens' `provider.openai` module
# (this is not the `openai` package itself).
from trulens_eval.feedback.provider.openai import OpenAI

# Handle used further down to start the dashboard via `tru.run_dashboard()`.
tru = Tru()

### Creating a Simple LLM Application

This example uses LlamaIndex, which internally uses an OpenAI LLM.

In [None]:
from llama_index import VectorStoreIndex
from llama_index.readers.web import SimpleWebPageReader

# Fetch the conference landing page, asking the reader to convert HTML to text.
page_urls = ["https://mlds.analyticsindiamag.com/"]
web_reader = SimpleWebPageReader(html_to_text=True)
documents = web_reader.load_data(page_urls)

# Build a vector index over the loaded documents and expose it as a query engine.
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

### Sending first request

In [None]:
# Send a first question through the query engine and show the answer.
first_question = "When is MLDS2024 & it's in which city?"
response = query_engine.query(first_question)
print(response)

MLDS2024 is scheduled to take place on February 1 to 2, 2024. The conference will be held in Bengaluru, India.


### Initialize Feedback Function(s)

In [None]:
import numpy as np

# Initialize the provider class once; every feedback function below shares it.
openai = OpenAI()

# Reuse the provider created above instead of constructing a second OpenAI() instance.
grounded = Groundedness(groundedness_provider=openai)

# Groundedness: check the app's output against the collected text of the
# retrieved source nodes, then combine the per-statement results with the
# aggregator supplied by `Groundedness`.
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons)
    .on(TruLlama.select_source_nodes().node.text.collect())
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

# Question/answer relevance between overall question and answer.
f_qa_relevance = Feedback(openai.relevance).on_input_output()

# Question/statement relevance between question and each context chunk,
# averaged over the chunks with np.mean.
f_qs_relevance = (
    Feedback(openai.qs_relevance)
    .on_input()
    .on(TruLlama.select_source_nodes().node.text)
    .aggregate(np.mean)
)

✅ In groundedness_measure_with_cot_reasons, input source will be set to __record__.app.query.rets.source_nodes[:].node.text.collect() .
✅ In groundedness_measure_with_cot_reasons, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In qs_relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In qs_relevance, input statement will be set to __record__.app.query.rets.source_nodes[:].node.text .


### Instrument app for logging with TruLens

In [None]:
# Wrap the query engine in a TruLlama recorder and attach the three feedback
# functions defined above under a fixed app id.
tru_query_engine_recorder = TruLlama(
    query_engine,
    app_id="LlamaIndex_App1",
    feedbacks=[f_groundedness, f_qa_relevance, f_qs_relevance],
)

In [None]:
# Run the query inside the recorder's context manager so TruLens can log the call
# (see the "Instrument app for logging" heading). `recording` is bound here but
# not used afterwards in this cell.
with tru_query_engine_recorder as recording:
    query_engine.query("When is MLDS2024 & it's in which city?")

### Streamlit Dashboard

In [None]:
# Start the TruLens dashboard for this session.
tru.run_dashboard() # open a local streamlit app to explore

# Uncomment the next line to shut the dashboard down again.
#tru.stop_dashboard() # stop if needed

### RAGAS

In [None]:
#! pip install ragas

# Questions posed to the query engine during the RAGAS evaluation.
eval_questions = [
    "When is MLDS2024 & it's in which city?",
    "What is MLDS?",
]

# One reference answer per question, in the same order as `eval_questions`.
# NOTE(review): these differ from the answer the query engine printed earlier;
# presumably that is deliberate for the demo -- confirm.
reference_answers = [
    "MLDS2024 is scheduled to take place on February 15 to 18, 2024. The conference will be held in Delhi, India.",
    "MLDS is a spaceship.",
]

# Wrap each reference answer in its own single-element list.
eval_answers = [[answer] for answer in reference_answers]


In [None]:
from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness
from ragas.metrics.critique import harmfulness

# Metrics handed to `evaluate` further down; the list order is kept as-is.
metrics = [faithfulness, answer_relevancy, context_precision, context_recall, harmfulness]

In [None]:
! pip install nest-asyncio
import nest_asyncio
nest_asyncio.apply()

from ragas.llama_index import evaluate

result = evaluate(query_engine, metrics, eval_questions, eval_answers)

evaluating with [faithfulness]


100%|██████████| 1/1 [00:15<00:00, 15.15s/it]


evaluating with [answer_relevancy]


100%|██████████| 1/1 [00:06<00:00,  6.91s/it]


evaluating with [context_precision]


100%|██████████| 1/1 [00:02<00:00,  2.51s/it]


evaluating with [context_recall]


100%|██████████| 1/1 [00:04<00:00,  4.35s/it]


evaluating with [harmfulness]


100%|██████████| 1/1 [00:03<00:00,  3.67s/it]


In [None]:
# Show the score obtained for each metric.
print(result)

{'faithfulness': 1.0000, 'answer_relevancy': 0.9530, 'context_precision': 0.5000, 'context_recall': 0.2500, 'harmfulness': 0.0000}
