### Evaluating using Phoenix Arize - Rag evaluation

In [1]:
# Install phoenix as well as the evals subpackage
# !pip install 'arize-phoenix[evals]' ipython matplotlib openai pycm scikit-learn

In [2]:
from phoenix.evals import (
    RAG_RELEVANCY_PROMPT_TEMPLATE,
    RAG_RELEVANCY_PROMPT_RAILS_MAP,
    OpenAIModel,
    download_benchmark_dataset,
    llm_classify,
)
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay

# Download the benchmark golden dataset
df = download_benchmark_dataset(
    task="binary-relevance-classification", dataset_name="wiki_qa-train"
)


# Sample and re-name the columns to match the template
df = df.sample(10)
benchmark = df
df = df.rename(
    columns={
        "query_text": "input",
        "document_text": "reference",
    },
)
model = OpenAIModel(
    model="gpt-4",
    temperature=0.0,
)
rails =list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values())
df[["eval_relevance"]] = llm_classify(df, model, RAG_RELEVANCY_PROMPT_TEMPLATE, rails)
#Golden dataset has True/False map to -> "irrelevant" / "relevant"
#we can then scikit compare to output of template - same format
y_true = df["relevant"].map({True: "relevant", False: "irrelevant"})
y_pred = df["eval_relevance"]

# Compute Per-Class Precision, Recall, F1 Score, Support
precision, recall, f1, support = precision_recall_fscore_support(y_true, y_pred)

  from .autonotebook import tqdm as notebook_tqdm
🐌!! If running llm_classify inside a notebook, patching the event loop with nest_asyncio will allow asynchronous eval submission, and is significantly faster. To patch the event loop, run `nest_asyncio.apply()`.
llm_classify |██████████| 10/10 (100.0%) | ⏳ 00:08<00:00 |  1.15it/s
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
precision, recall, f1, support 

(array([0. , 0.6, 0. ]),
 array([0.  , 0.75, 0.  ]),
 array([0.        , 0.66666667, 0.        ]),
 array([6, 4, 0]))

In [10]:
df #[df["query_id"]=='Q2136']

Unnamed: 0,query_id,input,document_title,reference,document_text_with_emphasis,relevant,eval_relevance
508,Q1646,what is a biwa pearl,Pearl,A pearl is a hard object produced within the s...,A pearl is a hard object produced within the s...,False,unrelated
678,Q1869,where does the word woman comes from,Woman,"A woman (), pl: women () is a female human . T...","A woman (), pl: women () is a female human . T...",False,unrelated
303,Q1385,who has renee zellweger dated,RenÃ©e Zellweger,"Renée Kathleen Zellweger (born April 25, 1969)...","Renée Kathleen Zellweger (born April 25, 1969)...",False,unrelated
923,Q2181,what is modified agi,Adjusted Gross Income,"For United States individual income tax, adjus...","For United States individual income tax, adjus...",False,unrelated
1453,Q2874,who is a utilization manager,Utilization management,Utilization management is the evaluation of th...,Utilization management is the evaluation of th...,False,relevant
147,Q1186,when did secretariat win,Secretariat (horse),"Secretariat (March 30, 1970 – October 4, 1989)...","SECRETARIAT (MARCH 30, 1970 – OCTOBER 4, 1989)...",True,relevant
1439,Q2857,What is hydrogen in,Hydrogen,Hydrogen is a chemical element with symbol H a...,Hydrogen is a chemical element with symbol H a...,True,unrelated
791,Q2023,What is the name of the main male character in...,The Last Song (film),The Last Song is a 2010 American coming of age...,The Last Song is a 2010 American coming of age...,False,relevant
896,Q2150,what is flour made from,Flour,Three different kinds of wheat and rye flour F...,Three different kinds of wheat and rye flour F...,True,relevant
1096,Q2405,where is modesto california,"Modesto, California","Modesto ( Spanish for ""modest""), officially th...","MODESTO ( SPANISH FOR ""MODEST""), OFFICIALLY TH...",True,relevant


In [9]:
benchmark

Unnamed: 0,query_id,query_text,document_title,document_text,document_text_with_emphasis,relevant
508,Q1646,what is a biwa pearl,Pearl,A pearl is a hard object produced within the s...,A pearl is a hard object produced within the s...,False
678,Q1869,where does the word woman comes from,Woman,"A woman (), pl: women () is a female human . T...","A woman (), pl: women () is a female human . T...",False
303,Q1385,who has renee zellweger dated,RenÃ©e Zellweger,"Renée Kathleen Zellweger (born April 25, 1969)...","Renée Kathleen Zellweger (born April 25, 1969)...",False
923,Q2181,what is modified agi,Adjusted Gross Income,"For United States individual income tax, adjus...","For United States individual income tax, adjus...",False
1453,Q2874,who is a utilization manager,Utilization management,Utilization management is the evaluation of th...,Utilization management is the evaluation of th...,False
147,Q1186,when did secretariat win,Secretariat (horse),"Secretariat (March 30, 1970 – October 4, 1989)...","SECRETARIAT (MARCH 30, 1970 – OCTOBER 4, 1989)...",True
1439,Q2857,What is hydrogen in,Hydrogen,Hydrogen is a chemical element with symbol H a...,Hydrogen is a chemical element with symbol H a...,True
791,Q2023,What is the name of the main male character in...,The Last Song (film),The Last Song is a 2010 American coming of age...,The Last Song is a 2010 American coming of age...,False
896,Q2150,what is flour made from,Flour,Three different kinds of wheat and rye flour F...,Three different kinds of wheat and rye flour F...,True
1096,Q2405,where is modesto california,"Modesto, California","Modesto ( Spanish for ""modest""), officially th...","MODESTO ( SPANISH FOR ""MODEST""), OFFICIALLY TH...",True
