In [0]:
%pip install -Uq "protobuf>=4.21.6"  qdrant-client tiktoken "arize-phoenix[evals,embeddings]" "openai>=1" openinference-instrumentation-langchain litellm

In [0]:

# %pip uninstall -y langchain langchain-core langchain-openai langchain-community langchain-groq langchain-text-splitters openinference-instrumentation-langchain

%pip install --no-cache-dir "langchain>=0.1.0,<0.4.0"
%pip install --no-cache-dir "langchain-core>=0.1.0,<0.4.0"
%pip install --no-cache-dir "langchain-openai>=0.0.2"
%pip install --no-cache-dir "langchain-community>=0.0.10"
%pip install --no-cache-dir "langchain-groq>=0.2.0"
%pip install --no-cache-dir "langchain-text-splitters>=0.0.1"
%pip install --no-cache-dir "openinference-instrumentation-langchain>=0.1.29"

dbutils.library.restartPython()

# Verify installations (run after restart)
%pip freeze | grep langchain


In [0]:

from langchain_core.messages.ai import InputTokenDetails
from langchain_openai import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Qdrant
from openinference.instrumentation.langchain import LangChainInstrumentor

# Print package versions to verify
import pkg_resources
packages = [
    'langchain',
    'langchain-community',
    'langchain-core',
    'langchain-groq',
    'langchain-openai',
    'langchain-text-splitters',
    'openinference-instrumentation-langchain'
]

for package in packages:
    try:
        version = pkg_resources.get_distribution(package).version
        print(f"{package}: {version}")
    except pkg_resources.DistributionNotFound:
        print(f"{package}: Not found")

langchain: 0.3.10
langchain-community: 0.3.10
langchain-core: 0.3.22
langchain-groq: 0.2.1
langchain-openai: 0.2.11
langchain-text-splitters: 0.3.2
openinference-instrumentation-langchain: 0.1.29


In [0]:

import os
import json
import tempfile
from getpass import getpass
from urllib.request import urlretrieve

import nest_asyncio
import numpy as np
import pandas as pd
from sklearn.metrics import ndcg_score
from langchain.callbacks import StdOutCallbackHandler

from langchain.chains import RetrievalQA, LLMChain
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain_groq import ChatGroq
from langchain_core.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter

from openinference.instrumentation.langchain import LangChainInstrumentor


import phoenix as px
from phoenix.otel import register

In [0]:
# Configuration and Initialization
nest_asyncio.apply()
pd.set_option("display.max_colwidth", None)

# Configure Groq API Key using dbutils
try:
    groq_api_key = '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'
    os.environ["GROQ_API_KEY"] = groq_api_key
    os.environ["PHOENIX_PROJECT_NAME"] = "Phoenix_Capabilities_Testing"
    os.environ["AZURE_API_KEY"] = "38a6b22e0e4f43828877d844399faf4d"
    os.environ["AZURE_API_BASE"] = "https://ai-abhinavkatiyarai793972137108.openai.azure.com" 
    os.environ["AZURE_API_VERSION"] = "2024-08-01-preview"

except Exception as e:
    print("Error accessing Groq API key from secrets. Please add it to the 'llm-keys' scope with key 'groq-api-key'")
    raise e



In [0]:
# Configure embeddings using SentenceTransformers
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)


def load_documents(directory_path):
    documents = []
    for filename in os.listdir(directory_path):
        if filename.endswith('.txt'):  
            file_path = os.path.join(directory_path, filename)
            loader = TextLoader(file_path)
            documents.extend(loader.load())
    return documents

documents = load_documents('/Workspace/Users/abhinav.katiyar@spaceinventive.com/data/')

# Create text splitter for smaller chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=30,
    length_function=len,
    separators=["\n\n", "\n", ".", "!", "?", " ", ""]
)

# Process documents into chunks
all_chunks = []
for doc in documents:
    chunks = text_splitter.split_text(doc.page_content)
    valid_chunks = [chunk for chunk in chunks if len(chunk) > 100]
    all_chunks.extend(valid_chunks)

In [0]:

# Build vector store
qdrant = Qdrant.from_texts(
    all_chunks,
    embeddings,
    location=":memory:",
    collection_name="my_documents",
)

# Configure retriever
retriever = qdrant.as_retriever(
    search_type="mmr",
    search_kwargs={
        "k": 2,
        "fetch_k": 3
    }
)

# Configure Groq for question generation
question_llm = ChatGroq(
    model_name="mixtral-8x7b-32768",
    temperature=0.0,
    max_tokens=512,
    streaming=False
)

# Configure Groq for QA
qa_llm = ChatGroq(
    model_name="mixtral-8x7b-32768",
    temperature=0.1,
    max_tokens=1024,
    streaming=False
)

In [0]:

# Question Generation Template
# generate_questions_template = """Create exactly 3 questions based on this text. Only return a JSON object.

# TEXT TO ANALYZE:
# {text}

# RESPONSE FORMAT:
# {{
# "question_1": "Write your first question here",
# "question_2": "Write your second question here",
# "question_3": "Write your third question here"
# }}

# IMPORTANT: Only return the JSON object, no additional text."""

generate_questions_template = """\
Context information is below.

---------------------
{text}
---------------------

Given the context information and not prior knowledge.
generate only questions based on the below query.

You are a Teacher/ Professor. Your task is to setup \
3 questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided."

Output the questions in JSON format with the keys question_1, question_2, question_3.
"""

# QA Template
qa_prompt_template = """Answer the following question based on the given context. Be concise.

Context: {context}

Question: {question}

Answer:"""

# Create chains
question_chain = LLMChain(
    llm=question_llm,
    prompt=PromptTemplate(
        template=generate_questions_template,
        input_variables=["text"]
    )
)

qa_chain = RetrievalQA.from_chain_type(
    llm=qa_llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={
        "prompt": PromptTemplate(
            template=qa_prompt_template,
            input_variables=["context", "question"]
        )
    }
)

  question_chain = LLMChain(


In [0]:
session = px.active_session()
print(session)

None


In [0]:
tracer_provider = register()
LangChainInstrumentor().instrument(skip_dep_check=True, tracer_provider=tracer_provider)

# Launch Phoenix
session = px.launch_app()
print(f"Phoenix UI available at: {session.url}")

In [0]:
question = "What information is available in the context?"

response = qa_chain({"query": question})
print(response['query'])
print(response['result'])


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel
"


What information is available in the context?
The context provides information about a regulation related to autonomous actions affecting certain systems (Category A systems) in the context of AI. It mentions that operators must maintain comprehensive documentation of decision paths for these autonomous actions. However, it does not provide information about philosophical questions, regulatory compliance across multiple jurisdictions, or impact on market dynamics and competitive positioning.


Trace(request_id=tr-0b4e802958a843c681c5bf1d5b81339f)

In [0]:
spans_df.columns

Index(['name', 'span_kind', 'parent_id', 'start_time', 'end_time',
       'status_code', 'status_message', 'events', 'context.span_id',
       'context.trace_id', 'attributes.openinference.span.kind',
       'attributes.output.value', 'attributes.metadata',
       'attributes.input.value', 'attributes.output.mime_type',
       'attributes.retrieval.documents',
       'attributes.llm.token_count.completion', 'attributes.input.mime_type',
       'attributes.llm.token_count.total', 'attributes.llm.token_count.prompt',
       'attributes.llm.input_messages', 'attributes.llm.output_messages',
       'attributes.llm.invocation_parameters'],
      dtype='object')

In [0]:
spans_df = px.Client().get_spans_dataframe()
spans_df[["name", "span_kind", "attributes.input.value", "attributes.retrieval.documents","attributes.llm.output_messages"]].head(2)



Unnamed: 0_level_0,name,span_kind,attributes.input.value,attributes.retrieval.documents,attributes.llm.output_messages
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
e6e8a53c4fc30767,VectorStoreRetriever,RETRIEVER,What information is available in the context?,"[{'document.content': '. Pursuant to subsection (c)(2), operators must maintain comprehensive documentation of decision paths for any autonomous actions affecting Category A systems (as defined in Appendix II-B).', 'document.metadata': {'_id': '72cd6582f98344139138a689b4c4a13f', '_collection_name': 'my_documents'}}, {'document.content': '2. Philosophical questions about AI agency and responsibility 3. Regulatory compliance across multiple jurisdictions 4. Impact on market dynamics and competitive positioning', 'document.metadata': {'_id': 'ddef3f0b37484073a21ca25eb25e4c5c', '_collection_name': 'my_documents'}}]",
d0759506f03a046e,ChatGroq,LLM,"{""messages"": [[{""lc"": 1, ""type"": ""constructor"", ""id"": [""langchain"", ""schema"", ""messages"", ""HumanMessage""], ""kwargs"": {""content"": ""Answer the following question based on the given context. Be concise.\n\nContext: . Pursuant to subsection (c)(2), operators must maintain comprehensive documentation of decision paths for any autonomous actions affecting Category A systems (as defined in Appendix II-B).\n\n2. Philosophical questions about AI agency and responsibility\n3. Regulatory compliance across multiple jurisdictions\n4. Impact on market dynamics and competitive positioning\n\nQuestion: What information is available in the context?\n\nAnswer:"", ""type"": ""human""}}]]}",,"[{'message.role': 'assistant', 'message.content': 'The context provides information about a regulation related to autonomous actions affecting certain systems (Category A systems) in the context of AI. It mentions that operators must maintain comprehensive documentation of decision paths for these autonomous actions. However, it does not provide information about philosophical questions, regulatory compliance across multiple jurisdictions, or impact on market dynamics and competitive positioning.'}]"


In [0]:
sampled_chunks = pd.DataFrame({"text": all_chunks})
sample_size = min(4, len(sampled_chunks))
sampled_chunks = sampled_chunks.sample(n=sample_size, random_state=42)

def clean_and_parse_response(response_text):
    """Clean and parse the response text into valid JSON."""
    try:
        cleaned = response_text.strip()
        start = cleaned.find('{')
        end = cleaned.rfind('}')
        
        if start != -1 and end != -1:
            cleaned = cleaned[start:end+1]
        
        result = json.loads(cleaned)
        
        required_keys = ['question_1', 'question_2', 'question_3']
        if not all(key in result for key in required_keys):
            raise ValueError("Missing required question keys")
            
        return result
    except Exception as e:
        print(f"Parsing error: {str(e)}")
        raise

# Generate questions
questions = []
for idx, row in sampled_chunks.iterrows():
    try:
        chunk_text = row['text'][:500]  # Limit chunk size
        
        response = question_chain.invoke({"text": chunk_text})
        parsed = clean_and_parse_response(response['text'])
        
        questions.append({
            "text": row['text'],  # Keep the original text chunk
            "question_1": parsed['question_1'],
            "question_2": parsed['question_2'],
            "question_3": parsed['question_3']
        })
        
    except Exception as e:
        print(f"Failed to process chunk {idx + 1}: {str(e)}")
        continue

# Create questions dataframe
questions_df = pd.DataFrame(questions)


# LLM_Generate by Phoneix (provide dby phoenix to generate questions)
import json

from phoenix.evals import OpenAIModel, llm_generate


def output_parser(response: str, index: int):
    try:
        return json.loads(response)
    except json.JSONDecodeError as e:
        return {"__error__": str(e)}


questions_df = llm_generate(
    dataframe=document_chunks_df,
    template=generate_questions_template,
    model=OpenAIModel(model="gpt-3.5-turbo"),
    output_parser=output_parser,
    concurrency=20,
)

In [0]:
import json
from phoenix.evals import LiteLLMModel, llm_generate

sampled_chunks = pd.DataFrame({"text": all_chunks})
sample_size = min(5, len(sampled_chunks))
sampled_chunks = sampled_chunks.sample(n=sample_size, random_state=42)


def output_parser(response: str, index: int):
    try:
        return json.loads(response)
    except json.JSONDecodeError as e:
        return {"__error__": str(e)}

questions_df = llm_generate(
    dataframe=sampled_chunks,
    template=generate_questions_template,
    model=LiteLLMModel(model="azure/gpt-35-turbo"),
    output_parser=output_parser,
    concurrency=20,
)

In [0]:
questions_df

Unnamed: 0,question_1,question_2,question_3
9,What is the participatory anthropic principle proposed by Wheeler?,"According to the context information, what is the implication of consciousness being primary?",Does objective reality exist independent of observation? Explain your answer with reference to the context information provided.
25,What is the 'substantial factor' test and which courts have adopted it?,What is the traditional causation standard for algorithmic influence assessment and how has it been modified?,What is the Model Autonomous Systems Code and how does it relate to the 'substantial factor' test?
8,What is the perspective proposed by some theorists regarding consciousness and reality?,How does the perspective of consciousness being fundamental to reality challenge materialist and dualist frameworks?,What is the difference between the emergent and fundamental perspectives of consciousness in relation to reality?
21,What is the requirement for operators regarding documentation of decision paths for autonomous actions affecting Category A systems?,What is the definition of Category A systems as per Appendix II-B?,What is the subsection that specifies the requirement for operators to maintain comprehensive documentation of decision paths for autonomous actions affecting Category A systems?
0,What is the Standard Model's conceptualization of quantum chromodynamics (QCD)?,What are strong interactions in the context of quantum chromodynamics (QCD)?,What is the role of quarks and gluons in quantum chromodynamics (QCD)?


In [0]:
# Construct a dataframe of the questions and the document chunks
questions_with_document_chunk_df = pd.concat([questions_df, sampled_chunks], axis=1)
questions_with_document_chunk_df = questions_with_document_chunk_df.melt(
    id_vars=["text"], value_name="question"
).drop("variable", axis=1)
# If the above step was interrupted, there might be questions missing. Let's run this to clean up the dataframe.
questions_with_document_chunk_df = questions_with_document_chunk_df[
    questions_with_document_chunk_df["question"].notnull()
]

In [0]:
questions_with_document_chunk_df

Unnamed: 0,text,question
0,"Consider the implications: if consciousness is primary, does objective reality exist independent of observation? This recalls Wheeler's participatory anthropic principle, suggesting that observers are necessary for the actualization of potential states",What is the participatory anthropic principle proposed by Wheeler?
1,". Some courts have adopted the ""substantial factor"" test outlined in the Model Autonomous Systems Code, while others maintain traditional ""but for"" causation standards with modifications for algorithmic influence assessment.""""""",What is the 'substantial factor' test and which courts have adopted it?
2,. Some theorists propose that consciousness might be fundamental to reality rather than emergent from it - a perspective that challenges both materialist and dualist frameworks.,What is the perspective proposed by some theorists regarding consciousness and reality?
3,". Pursuant to subsection (c)(2), operators must maintain comprehensive documentation of decision paths for any autonomous actions affecting Category A systems (as defined in Appendix II-B).",What is the requirement for operators regarding documentation of decision paths for autonomous actions affecting Category A systems?
4,The Standard Model's conceptualization of quantum chromodynamics (QCD) represents a sophisticated framework for understanding strong interactions between quarks and gluons,What is the Standard Model's conceptualization of quantum chromodynamics (QCD)?
5,"Consider the implications: if consciousness is primary, does objective reality exist independent of observation? This recalls Wheeler's participatory anthropic principle, suggesting that observers are necessary for the actualization of potential states","According to the context information, what is the implication of consciousness being primary?"
6,". Some courts have adopted the ""substantial factor"" test outlined in the Model Autonomous Systems Code, while others maintain traditional ""but for"" causation standards with modifications for algorithmic influence assessment.""""""",What is the traditional causation standard for algorithmic influence assessment and how has it been modified?
7,. Some theorists propose that consciousness might be fundamental to reality rather than emergent from it - a perspective that challenges both materialist and dualist frameworks.,How does the perspective of consciousness being fundamental to reality challenge materialist and dualist frameworks?
8,". Pursuant to subsection (c)(2), operators must maintain comprehensive documentation of decision paths for any autonomous actions affecting Category A systems (as defined in Appendix II-B).",What is the definition of Category A systems as per Appendix II-B?
9,The Standard Model's conceptualization of quantum chromodynamics (QCD) represents a sophisticated framework for understanding strong interactions between quarks and gluons,What are strong interactions in the context of quantum chromodynamics (QCD)?


In [0]:
# questions_df
# questions_with_document_chunk_df = questions_df.melt(
#     id_vars=["text"], value_name="question"
# ).drop("variable", axis=1)
# # If the above step was interrupted, there might be questions missing. Let's run this to clean up the dataframe.
# questions_with_document_chunk_df = questions_with_document_chunk_df[
#     questions_with_document_chunk_df["question"].notnull()
# ]
# questions_with_document_chunk_df

com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:138)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:133)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:133)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:714)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:432)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:432)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:458)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:537)
	at com.data

In [0]:
questions_with_document_chunk_df.shape

(15, 2)

In [0]:
def generate_qa_pairs(questions_df, retriever, qa_chain, max_context_length=1000, docs_per_question=2):
    """
    Generate question-answer pairs with retrieved context.
    
    Args:
        questions_df (pd.DataFrame): DataFrame containing questions with 'text' and 'question' columns
        retriever: Document retriever instance
        qa_chain: QA chain instance
        max_context_length (int): Maximum length for context
        docs_per_question (int): Number of documents to use for context
    
    Returns:
        pd.DataFrame: DataFrame containing QA pairs with context
    """
    qa_pairs = []
    total_questions = len(questions_df)
    
    if questions_df.empty:
        print("No questions to process!")
        return pd.DataFrame()
    
    for idx, row in questions_df.iterrows():
        try:
            question = row['question']
            print(f"\nProcessing question {idx + 1}/{total_questions}:")
            print(f"Question: {question}")
            
            # Retrieve relevant documents
            relevant_docs = retriever.get_relevant_documents(question)
            context = " ".join([doc.page_content for doc in relevant_docs[:docs_per_question]])
            
            # Truncate context if needed
            if len(context) > max_context_length:
                context = context[:max_context_length] + "..."
            
            # Generate answer - using 'query' instead of 'question'
            response = qa_chain({
                "query": question
            })
            
            # Store results
            qa_pair = {
                "text": row['text'],
                "question": question,
                "answer": response["result"],
                "context": context,
                "context_length": len(context)
            }
            
            print(f"Answer: {qa_pair['answer'][:100]}...")  # Print first 100 chars of answer
            qa_pairs.append(qa_pair)
            
        except Exception as e:
            print(f"Error processing question {idx + 1}: {str(e)}")
            # Add error entry to maintain data consistency
            qa_pairs.append({
                "text": row['text'],
                "question": question,
                "answer": f"Error generating answer: {str(e)}",
                "context": "",
                "context_length": 0
            })
            continue
        
        # Print progress
        if (idx + 1) % 5 == 0:
            print(f"\nCompleted {idx + 1}/{total_questions} questions")
    
    # Create DataFrame
    qa_df = pd.DataFrame(qa_pairs)
    
    # Add quality metrics
    qa_df['answer_length'] = qa_df['answer'].str.len()
    qa_df['question_length'] = qa_df['question'].str.len()
    
    print(f"\nProcessing complete! Generated {len(qa_df)} QA pairs")
    
    return qa_df

# Generate QA pairs
qa_df = generate_qa_pairs(
    questions_df=questions_with_document_chunk_df,
    retriever=retriever,
    qa_chain=qa_chain,
    max_context_length=1000,
    docs_per_question=2
)

In [0]:
from phoenix.session.evaluation import get_retrieved_documents

retrieved_documents_df = get_retrieved_documents(px.Client())
retrieved_documents_df

Unnamed: 0_level_0,Unnamed: 1_level_0,context.trace_id,input,reference
context.span_id,document_position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
e6e8a53c4fc30767,0,c4493a2c76d3cfdf1f2e7e43fdbc04ec,What information is available in the context?,". Pursuant to subsection (c)(2), operators must maintain comprehensive documentation of decision paths for any autonomous actions affecting Category A systems (as defined in Appendix II-B)."
e6e8a53c4fc30767,1,c4493a2c76d3cfdf1f2e7e43fdbc04ec,What information is available in the context?,2. Philosophical questions about AI agency and responsibility\n3. Regulatory compliance across multiple jurisdictions\n4. Impact on market dynamics and competitive positioning
950905a084b6365b,0,e83918b83b3d9d19d8884b8abc9ba4c9,What is the participatory anthropic principle proposed by Wheeler?,"Consider the implications: if consciousness is primary, does objective reality exist independent of observation? This recalls Wheeler's participatory anthropic principle, suggesting that observers are necessary for the actualization of potential states"
950905a084b6365b,1,e83918b83b3d9d19d8884b8abc9ba4c9,What is the participatory anthropic principle proposed by Wheeler?,". Yet this illusion appears necessary for practical functioning, creating a paradox where we must simultaneously accept and reject our apparent individuality."""""""
2c055f2dee5930a2,0,cdafdddcaa58e22e9c3ca97c5dd3dcaf,What is the participatory anthropic principle proposed by Wheeler?,"Consider the implications: if consciousness is primary, does objective reality exist independent of observation? This recalls Wheeler's participatory anthropic principle, suggesting that observers are necessary for the actualization of potential states"
...,...,...,...,...
1011046270e48ea3,1,ff84746664c13153e2aeeb2c34bc7af0,What is the subsection that specifies the requirement for operators to maintain comprehensive documentation of decision paths for autonomous actions affecting Category A systems?,"However, the interpretation of ""reasonable foreseeability"" under Article 12.4 remains contested, particularly in cases where multiple AI systems interact through standardized APIs. The precedent established in TechCorp v"
c1bc0ee3b32d457d,0,ad4726fbd2e6dcba3775cb3ce6097ba9,What is the role of quarks and gluons in quantum chromodynamics (QCD)?,The Standard Model's conceptualization of quantum chromodynamics (QCD) represents a sophisticated framework for understanding strong interactions between quarks and gluons
c1bc0ee3b32d457d,1,ad4726fbd2e6dcba3775cb3ce6097ba9,What is the role of quarks and gluons in quantum chromodynamics (QCD)?,"The Higgs mechanism, while separate from QCD, plays a crucial role through spontaneous symmetry breaking. This process generates masses for the W and Z bosons while leaving the photon massless"
065a5b74f19a8ee0,0,2be295e356d05d77174034ae056df0f3,What is the role of quarks and gluons in quantum chromodynamics (QCD)?,The Standard Model's conceptualization of quantum chromodynamics (QCD) represents a sophisticated framework for understanding strong interactions between quarks and gluons


In [0]:
import phoenix.evals

module_attrs = dir(phoenix.evals)

module_classes = [attr for attr in module_attrs if isinstance(getattr(phoenix.evals, attr), type)]
print("Classes available in phoenix.evals:")
for cls in module_classes:
    print(cls)

In [0]:
import os
import nest_asyncio
from phoenix.evals import (
    RelevanceEvaluator,
    run_evals,
    LiteLLMModel
)

nest_asyncio.apply()

if not (gemini_api_key := os.getenv("GEMINI_API_KEY")):
    gemini_api_key = input("Enter your Gemini API key: ")
os.environ["GEMINI_API_KEY"] = gemini_api_key

relevance_evaluator = RelevanceEvaluator(LiteLLMModel(
    model="gemini/gemini-pro"
))


retrieved_documents_relevance_df = run_evals(
    evaluators=[relevance_evaluator],
    dataframe=retrieved_documents_df,
    provide_explanation=True,
    concurrency=5
)[0]

Enter your Gemini API key:  

com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:136)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:136)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:133)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:133)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:714)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:432)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:432)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.can

In [0]:
import os
import nest_asyncio
from phoenix.evals import (
    RelevanceEvaluator,
    run_evals,
    LiteLLMModel
)

nest_asyncio.apply()

relevance_evaluator = RelevanceEvaluator(LiteLLMModel(
    model="azure/gpt-35-turbo"
))

retrieved_documents_relevance_df = run_evals(
    evaluators=[relevance_evaluator],
    dataframe=retrieved_documents_df,
    provide_explanation=True,
    concurrency=5
)[0]

In [0]:
retrieved_documents_relevance_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,label,score,explanation
context.span_id,document_position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
e6e8a53c4fc30767,0,unrelated,0,"The reference text mentions that operators must maintain comprehensive documentation of decision paths for any autonomous actions affecting Category A systems. However, it does not provide any specific information about what information is available in the context. Therefore, the reference text is unrelated to the question.\nLABEL: unrelated"
e6e8a53c4fc30767,1,unrelated,0,"The reference text lists three topics: philosophical questions about AI agency and responsibility, regulatory compliance across multiple jurisdictions, and impact on market dynamics and competitive positioning. None of these topics directly answer the question of what information is available in the context. Therefore, the label is ""unrelated"". \n\nLABEL: unrelated"
950905a084b6365b,0,relevant,1,"The reference text mentions Wheeler's participatory anthropic principle, which suggests that observers are necessary for the actualization of potential states. The question asks about this principle proposed by Wheeler. Therefore, the reference text is directly related to the question and contains information that can help answer it.\nLABEL: relevant"
950905a084b6365b,1,unrelated,0,"The reference text does not contain any information about the participatory anthropic principle proposed by Wheeler. It discusses an illusion related to individuality, but this is not relevant to the question.\nLABEL: unrelated"
2c055f2dee5930a2,0,relevant,1,"The reference text mentions Wheeler's participatory anthropic principle, which suggests that observers are necessary for the actualization of potential states. The question asks about this principle proposed by Wheeler. Therefore, the reference text is directly related to the question and contains information that can help answer it.\nLABEL: relevant"


In [0]:
documents_with_relevance_df = pd.concat(
    [retrieved_documents_df, retrieved_documents_relevance_df.add_prefix("eval_")], axis=1
)


In [0]:
documents_with_relevance_df.head(4)

Unnamed: 0_level_0,Unnamed: 1_level_0,context.trace_id,input,reference,eval_label,eval_score,eval_explanation
context.span_id,document_position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
e6e8a53c4fc30767,0,c4493a2c76d3cfdf1f2e7e43fdbc04ec,What information is available in the context?,". Pursuant to subsection (c)(2), operators must maintain comprehensive documentation of decision paths for any autonomous actions affecting Category A systems (as defined in Appendix II-B).",unrelated,0,"The reference text mentions that operators must maintain comprehensive documentation of decision paths for any autonomous actions affecting Category A systems. However, it does not provide any specific information about what information is available in the context. Therefore, the reference text is unrelated to the question.\nLABEL: unrelated"
e6e8a53c4fc30767,1,c4493a2c76d3cfdf1f2e7e43fdbc04ec,What information is available in the context?,2. Philosophical questions about AI agency and responsibility\n3. Regulatory compliance across multiple jurisdictions\n4. Impact on market dynamics and competitive positioning,unrelated,0,"The reference text lists three topics: philosophical questions about AI agency and responsibility, regulatory compliance across multiple jurisdictions, and impact on market dynamics and competitive positioning. None of these topics directly answer the question of what information is available in the context. Therefore, the label is ""unrelated"". \n\nLABEL: unrelated"
950905a084b6365b,0,e83918b83b3d9d19d8884b8abc9ba4c9,What is the participatory anthropic principle proposed by Wheeler?,"Consider the implications: if consciousness is primary, does objective reality exist independent of observation? This recalls Wheeler's participatory anthropic principle, suggesting that observers are necessary for the actualization of potential states",relevant,1,"The reference text mentions Wheeler's participatory anthropic principle, which suggests that observers are necessary for the actualization of potential states. The question asks about this principle proposed by Wheeler. Therefore, the reference text is directly related to the question and contains information that can help answer it.\nLABEL: relevant"
950905a084b6365b,1,e83918b83b3d9d19d8884b8abc9ba4c9,What is the participatory anthropic principle proposed by Wheeler?,". Yet this illusion appears necessary for practical functioning, creating a paradox where we must simultaneously accept and reject our apparent individuality.""""""",unrelated,0,"The reference text does not contain any information about the participatory anthropic principle proposed by Wheeler. It discusses an illusion related to individuality, but this is not relevant to the question.\nLABEL: unrelated"


In [0]:
import numpy as np
from sklearn.metrics import ndcg_score


def _compute_ndcg(df: pd.DataFrame, k: int):
    """Compute NDCG@k in the presence of missing values"""
    n = max(2, len(df))
    eval_scores = np.zeros(n)
    doc_scores = np.zeros(n)
    eval_scores[: len(df)] = df.eval_score
    try:
        return ndcg_score([eval_scores], k=k)
    except ValueError:
        return np.nan


ndcg_at_2 = pd.DataFrame(
    {"score": documents_with_relevance_df.groupby("context.span_id").apply(_compute_ndcg, k=2)}
)

[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
File [0;32m<command-4402110491910889>, line 18[0m
[1;32m     13[0m     [38;5;28;01mexcept[39;00m [38;5;167;01mValueError[39;00m:
[1;32m     14[0m         [38;5;28;01mreturn[39;00m np[38;5;241m.[39mnan
[1;32m     17[0m ndcg_at_2 [38;5;241m=[39m pd[38;5;241m.[39mDataFrame(
[0;32m---> 18[0m     {[38;5;124m"[39m[38;5;124mscore[39m[38;5;124m"[39m: documents_with_relevance_df[38;5;241m.[39mgroupby([38;5;124m"[39m[38;5;124mcontext.span_id[39m[38;5;124m"[39m)[38;5;241m.[39mapply(_compute_ndcg, k[38;5;241m=[39m[38;5;241m2[39m)}
[1;32m     19[0m )

File [0;32m/databricks/python/lib/python3.12/site-packages/pandas/core/groupby/groupby.py:1588[0m, in [0;36mGroupBy.apply[0;34m(self, func, *args, **kwargs)[0m
[1;32m   1580[0m             new_msg [38;5;241m=[39m (
[1;32m   158

In [0]:
precision_at_2 = pd.DataFrame(
    {
        "score": documents_with_relevance_df.groupby("context.span_id").apply(
            lambda x: x.eval_score[:2].sum(skipna=False) / 2
        )
    }
)

In [0]:
precision_at_2

Unnamed: 0_level_0,score
context.span_id,Unnamed: 1_level_1
00b0644a6aa222f6,0.5
02eae8211d552185,0.5
065a5b74f19a8ee0,0.5
1011046270e48ea3,0.5
1af0005417e552dd,0.5
2259fbec7e399107,0.5
2a3d53e0de65c32e,0.5
2c055f2dee5930a2,0.5
2f3bdaccf2465649,1.0
3613029ba02c1609,1.0


In [0]:
hit = pd.DataFrame(
    {
        "hit": documents_with_relevance_df.groupby("context.span_id").apply(
            lambda x: x.eval_score[:2].sum(skipna=False) > 0
        )
    }
)

In [0]:
retrievals_df = px.Client().get_spans_dataframe(
    "span_kind == 'RETRIEVER' and input.value is not None"
)
rag_evaluation_dataframe = pd.concat(
    [
        retrievals_df["attributes.input.value"],
        precision_at_2.add_prefix("precision@2_"),
        hit,
    ],
    axis=1,
)
rag_evaluation_dataframe

In [0]:
# Aggregate the scores across the retrievals
results = rag_evaluation_dataframe.mean(numeric_only=True)
results

precision@2_score    0.677419
hit                  0.967742
dtype: float64

In [0]:
from phoenix.trace import DocumentEvaluations, SpanEvaluations

px.Client().log_evaluations(
    SpanEvaluations(dataframe=precision_at_2, eval_name="precision@2"),
    DocumentEvaluations(dataframe=retrieved_documents_relevance_df, eval_name="relevance"),
)

In [0]:
from phoenix.session.evaluation import get_qa_with_reference

qa_with_reference_df = get_qa_with_reference(px.Client())
qa_with_reference_df.head(1)

Unnamed: 0_level_0,input,output,reference
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ca7aa2a22839c2b7,What information is available in the context?,"{""result"": ""The context provides information about a regulation related to autonomous actions affecting certain systems (Category A systems) in the context of AI. It mentions that operators must maintain comprehensive documentation of decision paths for these autonomous actions. However, it does not provide information about philosophical questions, regulatory compliance across multiple jurisdictions, or impact on market dynamics and competitive positioning."", ""source_documents"": [""page_content='. Pursuant to subsection (c)(2), operators must maintain comprehensive documentation of decision paths for any autonomous actions affecting Category A systems (as defined in Appendix II-B).' metadata={'_id': '72cd6582f98344139138a689b4c4a13f', '_collection_name': 'my_documents'}"", ""page_content='2. Philosophical questions about AI agency and responsibility\n3. Regulatory compliance across multiple jurisdictions\n4. Impact on market dynamics and competitive positioning' metadata={'_id': 'ddef3f0b37484073a21ca25eb25e4c5c', '_collection_name': 'my_documents'}""]}",". Pursuant to subsection (c)(2), operators must maintain comprehensive documentation of decision paths for any autonomous actions affecting Category A systems (as defined in Appendix II-B).\n\n2. Philosophical questions about AI agency and responsibility\n3. Regulatory compliance across multiple jurisdictions\n4. Impact on market dynamics and competitive positioning"


In [0]:
from phoenix.evals import (
    HallucinationEvaluator,
    OpenAIModel,
    LiteLLMModel,
    QAEvaluator,
    run_evals,
)

qa_evaluator = QAEvaluator(LiteLLMModel(
    model="azure/gpt-35-turbo"
))
hallucination_evaluator = HallucinationEvaluator(LiteLLMModel(
    model="azure/gpt-35-turbo"
))

qa_correctness_eval_df, hallucination_eval_df = run_evals(
    evaluators=[qa_evaluator, hallucination_evaluator],
    dataframe=qa_with_reference_df,
    provide_explanation=True,
    concurrency=20,
)

In [0]:
qa_correctness_eval_df.head(5)

Unnamed: 0_level_0,label,score,explanation
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
67450fc84b588cce,correct,1,"The answer provided does correctly answer the question. The reference text states that the issue with the physical diploma was that the quotes had been included, making them read as scare-quotes. The answer directly quotes this information and provides the context in which the issue was discovered. Therefore, the answer is ""correct"". \nLABEL: ""correct"""
267f5fb10d21f760,correct,1,"The answer correctly states that the issue with the quotes on the physical diploma was that they made the words appear as scare-quotes, which the person found bothersome at the time. This is supported by the reference text, which states ""When I got the actual physical diploma, I was dismayed to find that the quotes had been included, which made them read as scare-quotes. At the time this bothered me, but now it seems amusingly accurate, for reasons I was about to discover."" Therefore, the answer is correct.\nLABEL: ""correct"""
b3e18e69029b958f,incorrect,0,"The question asks for the purpose of launching privately before launching publicly. The reference text states that the purpose was to recruit an initial set of users and ensure they had decent-looking stores. The answer provides two separate quotes from the reference text, one stating the purpose of recruiting an initial set of users and the other stating the benefit of having colleagues who understand the problems faced by founders. While the second quote may be related to the benefits of launching privately, it does not directly answer the question. Therefore, the answer is partially correct but not fully correct. \nLABEL: ""incorrect"""
7ca7b83ec02db4e7,correct,1,"The answer correctly states that the purpose of launching privately before launching publicly was to recruit an initial set of users and ensure they had decent-looking stores. Additionally, it correctly identifies that one of the benefits of launching privately was to address the isolation faced by founders by providing colleagues who understood their problems. Therefore, the answer fully and accurately answers the question. \nLABEL: ""correct"""
8f8f3de66bfbb47c,incorrect,0,"The answer does not provide any information about the author's opinion on the painting method described in the context. Therefore, the answer is incorrect. \nLABEL: ""incorrect"""


In [0]:
hallucination_eval_df.head(5)

Unnamed: 0_level_0,label,score,explanation
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
67450fc84b588cce,hallucinated,1.0,"The answer provided does not address the query of ""What was the issue with the quotes on the physical diploma?"" and instead provides two separate sentences from the reference text. Therefore, the answer is hallucinated. \nLABEL: hallucinated"
267f5fb10d21f760,factual,0.0,"The answer correctly states that the issue with the quotes on the physical diploma was that they made the words appear as scare-quotes, which the person found bothersome at the time. This information is directly stated in the reference text. Therefore, the answer is factual. \n\nLABEL: factual"
b3e18e69029b958f,factual,0.0,"The answer provided contains two parts of the reference text that directly answer the query. The first part explains that the purpose of launching privately was to recruit an initial set of users and ensure they had decent-looking stores before launching publicly. The second part explains that launching privately also solved the problem of isolation for founders by providing colleagues who understood the problems they were facing. Therefore, the answer is factual and provides accurate information based on the reference text. \n\nLABEL: factual"
7ca7b83ec02db4e7,factual,0.0,"The answer accurately reflects the information provided in the reference text. It states that the purpose of launching privately before launching publicly was to recruit an initial set of users, ensure they had decent-looking stores, and address the isolation faced by founders by providing colleagues who understood their problems. This information is supported by the reference text, which states that the company had to launch privately to recruit an initial set of users and ensure they had decent-looking stores, and that launching privately also addressed the isolation faced by founders by providing colleagues who understood their problems. Therefore, the answer is factual. \nLABEL: factual"
8f8f3de66bfbb47c,hallucinated,1.0,"The query asks for the author's opinion on the painting method described in the context. The reference text mentions painting and painting students, but does not provide any specific information about a painting method or the author's opinion on it. The answer provided is a set of two documents that do not directly address the query or provide any information about the author's opinion on a painting method. Therefore, the answer is hallucinated as it does not provide factual information related to the query and reference text. \n\nLABEL: hallucinated"


In [0]:
qa_correctness_eval_df.mean(numeric_only=True)

score    0.733333
dtype: float64

In [0]:
hallucination_eval_df.mean(numeric_only=True)

score    0.233333
dtype: float64

In [0]:
from phoenix.trace import SpanEvaluations

px.Client().log_evaluations(
    SpanEvaluations(dataframe=qa_correctness_eval_df, eval_name="Q&A Correctness"),
    SpanEvaluations(dataframe=hallucination_eval_df, eval_name="Hallucination"),
)