# **The Negotiator: A Testbed for evaluating RAG strategies**
Hello! Welcome to Abraham Alappat's personal project — a QA system to help people conduct negotiations for sales and procurement! In this and other notebooks, we will use a test-bed RAG QA product to test various strategies for optimizing RAG.

In the longer run we will convert this test-bed into a productized ML pipeline that will be further iterated.

**Goal for this particular notebook:** Test which chunking strategy helps optimize ML performance, latency and throughput for a set of standardized 50 questions.  

**The Test-bed Product:** A Q/A tool that refers to best-practice negotiation tactics for any given sales or procurement situation and does not contain the contradictory information that is available in web-scale data. This notebook represents the MVP exploration of such a product and will be iterated on to test various aspects of the RAG pipeline.


---------------------------------
Note that this notebook follows the best tradition of software... re-use of code! I wanted a RAG-based Q&A tool, so I adapted Arize's tutorials on Pinecone search and retrieval and on tracing, together with the DeepLearning.AI Prompt Engineering course.

Things out of scope given the time for this project:
1. Optimizing K, model temperature, model type, vector DB, system prompt, etc.

*Code sourced from elsewhere will be marked as "sourced from [source]" in code comments*






---------------------------------

# Step 1: Install Libraries and Dependencies + Obtain Pinecone and OpenAI details
**Background:** Here we are installing the usual libraries, packages and dependencies, and collecting the information needed by downstream functions.

**Instructions:** Run all cells and enter your OpenAI API key, as well as your Pinecone API key, environment and index name, to enable the full notebook to run! The Pinecone index should be set up in the same way as in the Arize RAG tutorial.

**1.1 Installations and Imports**

In [None]:
%pip install cython numpy scikit-learn

In [3]:
# Sourced from Arize Pinecone Search and Retrieval Tutorial + Langchain Tutorial + DeepLearning Tutorials

%pip install langchain openai==0.28.1 pinecone-client python-dotenv pypdf cohere tiktoken arize-phoenix unstructured fastparquet pyarrow GitPython tqdm  --quiet






Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × Building wheel for hdbscan (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [40 lines of output]
      running bdist_wheel
      running build
      running build_py
      creating build
      creating build\lib.win-amd64-cpython-310
      creating build\lib.win-amd64-cpython-310\hdbscan
      copying hdbscan\flat.py -> build\lib.win-amd64-cpython-310\hdbscan
      copying hdbscan\hdbscan_.py -> build\lib.win-amd64-cpython-310\hdbscan
      copying hdbscan\plots.py -> build\lib.win-amd64-cpython-310\hdbscan
      copying hdbscan\prediction.py -> build\lib.win-amd64-cpython-310\hdbscan
      copying hdbscan\robust_single_linkage_.py -> build\lib.win-amd64-cpython-310\hdbscan
      copying hdbscan\validity.py -> build\lib.win-amd64-cpython-310\hdbscan
      copying hdbscan\__init__.py -> build\lib.win-amd64-cpython-310\hdbscan
      creating build\lib.win-amd64-cpython-310\hdbscan\tests
      copying hdbscan\tests\test_flat.py

In [None]:
#Base tools
import os
import textwrap
from getpass import getpass
from typing import Dict, List, Optional, Tuple
import numpy as np
import pandas as pd
import json
import time
from typing_extensions import dataclass_transform
from pandas.io import parquet
# from google.colab import drive
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)
from tqdm import tqdm
from contextlib import contextmanager
from git import Repo

#OpenAI and Langchain
import openai
import langchain
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain import hub
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.vectorstores import Pinecone


#Observational Tools and vector database
import phoenix as px
from phoenix.trace.langchain import OpenInferenceTracer, LangChainInstrumentor
import pinecone

pd.set_option("display.max_colwidth", None)

In [1]:
%pip list

Package                       Version
----------------------------- --------
asttokens                     2.4.0
backcall                      0.2.0
backports.functools-lru-cache 1.6.5
certifi                       2023.5.7
charset-normalizer            3.2.0
colorama                      0.4.6
comm                          0.1.4
Cython                        3.0.6
debugpy                       1.6.7
decorator                     5.1.1
exceptiongroup                1.1.3
executing                     1.2.0
idna                          3.4
importlib-metadata            6.8.0
ipykernel                     6.26.0
ipython                       8.16.1
jedi                          0.19.1
joblib                        1.3.2
jupyter_client                8.5.0
jupyter_core                  5.4.0
matplotlib-inline             0.1.6
nest-asyncio                  1.5.8
numpy                         1.25.1
osmium                        3.6.0
packaging                     23.1
pandas             

**1.2. Open AI Login**

In [None]:
# Sourced from Arize Pinecone Search and Retrieval Tutorial
# Use OPENAI_API_KEY from the environment when it is set and non-empty;
# otherwise ask for it interactively without echoing the input.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    openai_api_key = getpass("🔑 Enter your OpenAI API key: ")
# Store the key both in the process environment and on the openai module.
os.environ["OPENAI_API_KEY"] = openai_api_key
openai.api_key = openai_api_key

**1.3. Vector DB Logins**

In [None]:
#pinecone login
# Each setting is read from its environment variable when that variable is
# already set and non-empty (so re-running the cell, or running in a
# pre-configured environment, does not prompt again); otherwise it is requested
# interactively. This mirrors how the OpenAI key is obtained.
if not (pinecone_api_key := os.getenv("YOUR_PINECONE_API_KEY")):
    pinecone_api_key = getpass(prompt="🔑 Enter your Pinecone API key: ")
if not (pinecone_environment := os.getenv("YOUR_PINECONE_ENVIRONMENT")):
    pinecone_environment = getpass(prompt="set your Pinecone environment")
if not (pinecone_index_name := os.getenv("YOUR_PINECONE_INDEX_NAME")):
    pinecone_index_name = getpass(prompt="set your Pinecone index name")

pinecone.init(api_key=pinecone_api_key, environment=pinecone_environment)

#Saving environment Variables
# These names are the ones expanded on the command line of the index-building
# script later in the notebook.
os.environ["YOUR_PINECONE_API_KEY"] = pinecone_api_key
os.environ["YOUR_PINECONE_INDEX_NAME"]= pinecone_index_name
os.environ["YOUR_PINECONE_ENVIRONMENT"]= pinecone_environment

**1.4. Experimental Variables and Output Paths**

In [None]:
#Experiment Variables: (You can change this to be more specific with Neptune and W&B and track there)
# The lists are JSON-encoded because environment variables can only hold strings.
os.environ["CHUNK_TYPES"] = json.dumps(["RecursiveCharacterTextSplitter"])
os.environ["CHUNK_SIZES"] = json.dumps([512,400,300,256])
os.environ["CHUNK_OVERLAPS"] = json.dumps([20,10])

# Output folder for the parquet files and input folder for the source documents.
os.environ["OUTPUT_PARQUET_PATH"]= "parq/"
os.environ["DOC_PATH"]= "test/"

# Create the output folder from Python instead of `!mkdir`, so re-running the
# cell does not error when the folder already exists.
os.makedirs(os.environ["OUTPUT_PARQUET_PATH"], exist_ok=True)

**1.5. Set up Observability Tools**

In this version we are setting up tracing with Phoenix from Arize.

In [None]:
# Launch Phoenix to enable tracing
# `session` is kept because its URL is displayed later in the notebook.
session = px.launch_app()
# Create a tracer and hand it to the LangChain instrumentor.
# NOTE(review): presumably this makes subsequent LangChain chain runs show up
# as traces in the Phoenix session — confirm against the Phoenix docs.
tracer = OpenInferenceTracer()
LangChainInstrumentor(tracer).instrument()

**1.6. Clone Github Repo to help set up the right files for end users**

In [None]:
print(os.getcwd())

@contextmanager
def cwd(path):
    """Temporarily switch the process working directory to `path`.

    The directory that was current on entry is restored when the `with`
    block exits, whether it exits normally or by raising.
    """
    previous_dir = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        # Always go back, even when the body raised.
        os.chdir(previous_dir)

In [None]:
repo_url = "https://github.com/abemdxb/theNegotiator.git"
repo_path = "/content/theNegotiator"

!git clone "https://github.com/abemdxb/theNegotiator.git"

with cwd(repo_path):
  !git status

print(os.getcwd())

# Step 2: Data Preparation

**Background:** In this section, I am doing Data prep - specifically:
1. Creating the contextual data in Vector Databases for RAG from three best practice textbooks/ articles for negotiations in sales, procurement and personal relationships.
2. Creating the ⏰ 50 ⏰ example user queries that we can use to evaluate the model!

**Instructions:** Run the following cells. Nothing else should be required. *Note: for the user-facing version of this notebook, we will be importing the standard queries I developed earlier — this is to ensure that manual annotations done earlier are still usable. OpenAI's models can generate different queries over time.*


**2.1. Create a new Pinecone Index and delete old one if it's already there**

In [None]:
# create the pinecone index  - sourced from https://github.com/pinecone-io/examples/blob/master/docs/quick-tour/hello-pinecone.ipynb

#delete the old index to ensure we don't have duplicate chunks (needed if you have a free pinecone account- only one index allowed there)
if pinecone_index_name in pinecone.list_indexes():
    pinecone.delete_index(pinecone_index_name)

# Always (re)create the index. This call used to sit inside the `if` above, so
# on a fresh account the index was never created and the readiness check below
# was run against an index that does not exist.
# NOTE(review): dimension must equal the embedding model's vector size; 1536 is
# assumed to match the "text-embedding-ada-002" model used later — confirm.
pinecone.create_index(name=pinecone_index_name, dimension=1536, metric="cosine")

# wait for index to be ready before connecting
while not pinecone.describe_index(pinecone_index_name).status['ready']:
    time.sleep(1)

print("Index ready")

**2.2. Download pdf and markdown versions of negotiation textbooks**

In [None]:
print(os.getcwd())
!mkdir test
!curl -o test/paper1.pdf https://www.peaksellinginc.com/userfiles/25%20Most%20Difficult%20Negotiation%20Tactics.pdf
!curl -o test/paper2.pdf https://spada.uns.ac.id/pluginfile.php/238682/mod_resource/content/1/Roy%20J.%20Lewicki%2C%20Bruce%20Barry%2C%20David%20M.%20Saunders%20-%20Essentials%20of%20Negotiation-McGraw-Hill%20Education%20%282016%29.pdf
!curl -o test/paper3.pdf https://www.atlantis-press.com/article/125958466.pdf

In [None]:
# print(os.getcwd())
# !mkdir testmd
# !wget -O testmd/mdpaper1.md https://raw.githubusercontent.com/abemdxb/theNegotiator/main/test/25_Most_Difficult_Negotiation_Tactics.md
# !wget -O testmd/mdpaper2.md https://raw.githubusercontent.com/abemdxb/theNegotiator/main/test/Essentials_of_Negotiation.md
# !wget -O testmd/mdpaper3.md https://github.com/abemdxb/theNegotiator/blob/main/test/Negotiation_and_Romantic_Relationships.md

**2.3. Run Python script to load the Pinecone Index with vectors**

Note we use one namespace for each chunking strategy as specified by environment variables

In [None]:
# Download the chunking/indexing script from the project repository and save it
# under a local file name.
!curl -o build_negotiation_pdf_index_langchain_pinecone.py https://raw.githubusercontent.com/abemdxb/theNegotiator/main/multiple_chunk_strategy.py
# Run the script, passing the credentials, paths and chunking settings that
# earlier cells stored in environment variables.
# NOTE(review): the API keys are passed as command-line arguments, which can be
# visible to other users in the process list — consider having the script read
# them from the environment instead.
# NOTE(review): the `$NAME` references assume a POSIX-style shell; confirm they
# expand on the platform where the notebook is run.
!python build_negotiation_pdf_index_langchain_pinecone.py --pinecone-api-key $YOUR_PINECONE_API_KEY --pinecone-index-name $YOUR_PINECONE_INDEX_NAME --pinecone-environment $YOUR_PINECONE_ENVIRONMENT --openai-api-key $OPENAI_API_KEY --output-parquet-path $OUTPUT_PARQUET_PATH  --docs-path $DOC_PATH --chunk-types "$CHUNK_TYPES" --chunk-sizes "$CHUNK_SIZES" --chunk-overlaps "$CHUNK_OVERLAPS"


**2.4 Create 50 synthetic queries/prompts via GPT-4, store them in a json**

To Do: See if we can generate a better set of 50+ Q/A pairs from the other package that uses the chunks/ pdfs to create intelligent questions.

In [None]:
# Custom code to get the queries - note that I iterated on this prompt a few times and tried few shot techniques etc. but it reduced the variety of prompts being created which I thought was interesting for embeddings analysis.

def get_completion(prompt, model="gpt-4", temperature=0):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Text sent as the only message, with role "user".
        model: Name of the chat model to call.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The "content" of the first choice in the chat completion response.
    """
    chat_response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )
    first_choice = chat_response.choices[0]
    return first_choice.message["content"]


# Prompt asking the model to answer in JSON only, with each scenario stored
# under the key 'text'.
# NOTE(review): the prompt asks for 5 scenarios while the section heading
# mentions 50 queries — confirm which number is intended.
prompt = """ You are an assistant that only speaks in JSON format. Do not write normal text.

Create 5 random scenarios where a target person is asking for help dealing with a challenging sales or procurement negotiation tactic used by their opponent.

Ensure the scenario is in the form of a question from the perspective of a 'target person'.
Vary whether the target person is in sales or procurement, the type of tactics/strategy their counterparty is using, and how specific they are being. Use no more that 20 words per question.

Ensure the key used is 'text'."""
# Ask the model, then parse the reply text as JSON.
# NOTE(review): json.loads raises json.JSONDecodeError if the reply is not
# valid JSON (for example if the model adds surrounding text).
queries = get_completion(prompt)
queries_json = json.loads(queries)


**2.5. [Do not uncomment -for Abe only] Save queries_json in the local Git clone of theNegotiator**

In [None]:
## insert code to save file to GitHub Repo
# Destination of the reference query set inside the local clone.
file_name = f'{repo_path}/workingfiles/queries.json'
os.environ['FILE_NAME'] = file_name

# Make sure the target folder exists before writing.
target_dir = os.path.dirname(file_name)
os.makedirs(target_dir, exist_ok=True)

with open(file_name, 'w') as json_file:
    json.dump(queries_json, json_file)

print(f'JSON file saved at: {file_name}')

In [None]:
# Show the state of the local clone (run from inside the checkout).
with cwd(repo_path):
  !git status

**2.6. [Do not uncomment- works for Abe only]  Push Git Clone back to GitHub to update reference set of queries**



In [None]:
# Prompt for the GitHub personal access token without echoing it.
github_token = getpass("Enter your GitHub personal access token: ")

In [None]:
with cwd(repo_path):
  !git config --global user.email 'abemdxb@gmail.com'
  !git config --global user.name 'Abe Alappat'

  !git add -A
  !git commit -m "add queries.json"

  # Push to the remote repository using the token
  !git push "https://abemdxb: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@github.com/abemdxb/theNegotiator.git" --all

**2.5 Inspect Query Dataframe containing User Queries**

In [None]:
# Load the established reference set of queries from the cloned repository into a dataframe.
# The path is rebuilt from `repo_path` here so that this cell does not depend on
# the optional "save queries" cell having been run.
queries_path = f'{repo_path}/workingfiles/queries.json'
query_df = pd.read_json(queries_path)

#Inspect the query
# Only the last expression of a cell is rendered, so show head() explicitly.
display(query_df.head())
query_df.describe()

In [None]:
# create df with new queries to see if they match old ones
query_df2 = pd.DataFrame.from_dict(queries_json)

# Only the last expression of a cell is rendered, so show head() explicitly.
display(query_df2.head())
query_df2.describe()

**2.6. Create database_df for later use**

In [None]:
#Custom code

#things to check- text_vector_y -> does this exist as part of the OpenAIEmbedding object?

#load
# Read the knowledge-base parquet from the configured output folder instead of
# a hard-coded absolute path, so the cell works wherever the notebook is run.
knowledge_db_path = os.path.join(
    os.environ.get("OUTPUT_PARQUET_PATH", "parq/"), "knowledge_db.pq"
)
database_df = pd.read_parquet(knowledge_db_path)
database_df.head(3)

In [None]:
# Summary statistics of the knowledge-base dataframe.
database_df.describe()

# TODO: check why there are 2x the number of vectors in the database than in the chunked data. Ask the Pinecone community.

# Step 3. Pipeline Creation

**Background:** Create classes for use in creation of the Langchain Pipeline. To save time I re-used the classes from the Arize Tutorial

**Instructions:** Run each cell. No additional inputs needed.

**3.1. Wrapper Class to create and store query and document embeddings**

In [None]:
# Sourced from Arize Pinecone Search and Retrieval Tutorial
class OpenAIEmbeddingsWrapper(OpenAIEmbeddings):
    """
    A wrapper around OpenAIEmbeddings that stores the query and document
    embeddings.

    Every text passed to `embed_query` / `embed_documents` is recorded together
    with the embedding returned by the parent class, and can be read back as a
    dataframe through `query_embedding_dataframe` /
    `document_embedding_dataframe`.
    """

    # text -> embedding for every query embedded so far (most recent last).
    query_text_to_embedding: Dict[str, List[float]] = {}
    # text -> embedding for every document embedded so far.
    document_text_to_embedding: Dict[str, List[float]] = {}

    def embed_query(self, text: str) -> List[float]:
        """Embed `text` with the parent class, record it, and return the embedding."""
        embedding = super().embed_query(text)
        # Remove any earlier entry first so a repeated query moves to the end
        # of the dict. Callers read "the latest query" with `.iloc[-1]` on the
        # dataframe; a plain assignment would keep the old position and make
        # them read a different query's embedding.
        self.query_text_to_embedding.pop(text, None)
        self.query_text_to_embedding[text] = embedding
        return embedding

    def embed_documents(self, texts: List[str], chunk_size: Optional[int] = 0) -> List[List[float]]:
        """Embed `texts` with the parent class, record each one, and return the embeddings."""
        embeddings = super().embed_documents(texts, chunk_size)
        for text, embedding in zip(texts, embeddings):
            self.document_text_to_embedding[text] = embedding
        return embeddings

    @property
    def query_embedding_dataframe(self) -> pd.DataFrame:
        """Recorded query embeddings as a dataframe with columns `text`, `text_vector`."""
        return self._convert_text_to_embedding_map_to_dataframe(self.query_text_to_embedding)

    @property
    def document_embedding_dataframe(self) -> pd.DataFrame:
        """Recorded document embeddings as a dataframe with columns `text`, `text_vector`."""
        return self._convert_text_to_embedding_map_to_dataframe(self.document_text_to_embedding)

    @staticmethod
    def _convert_text_to_embedding_map_to_dataframe(
        text_to_embedding: Dict[str, List[float]]
    ) -> pd.DataFrame:
        """Turn a text -> embedding mapping into a two-column dataframe.

        Rows follow the mapping's insertion order. An empty mapping yields an
        empty dataframe with both columns (unpacking `zip(*items)` used to
        raise ValueError in that case).
        """
        texts = list(text_to_embedding.keys())
        embedding_arrays = [np.array(embedding) for embedding in text_to_embedding.values()]
        return pd.DataFrame.from_dict(
            {
                "text": texts,
                "text_vector": embedding_arrays,
            }
        )


**3.2. Wrapper class to record retrieval data**

In [None]:
# Sourced from Arize Pinecone Search and Retrieval
class PineconeWrapper(Pinecone):
    """
    A wrapper around the LangChain Pinecone vector store that records, for each
    query text, the (document, score) tuples returned by
    `similarity_search_with_score`.
    """

    # Class-level default, kept for backward compatibility; every instance gets
    # its own dict in __init__ so that separate wrappers (for example one per
    # namespace) do not write into the same shared mapping.
    query_text_to_document_score_tuples: Dict[str,List[Tuple[Document, float]]] = {}

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the parent and start with an empty record."""
        super().__init__(*args, **kwargs)
        self.query_text_to_document_score_tuples = {}

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        filter: Optional[dict] = None,
        namespace: Optional[str] = None,
    ) -> List[Tuple[Document, float]]:
        """Run the parent's search, record its result under `query`, and return it."""
        document_score_tuples = super().similarity_search_with_score(
            query=query,
            k=k,
            filter=filter,
            namespace=namespace,
        )
        # Remove any earlier entry first so a repeated query moves to the end
        # of the dict. Callers read "the latest retrieval" with `.tail(k)` on
        # `retrieval_dataframe`; a plain assignment would keep the old position
        # and make them read another query's documents.
        self.query_text_to_document_score_tuples.pop(query, None)
        self.query_text_to_document_score_tuples[query] = document_score_tuples
        return document_score_tuples

    @property
    def retrieval_dataframe(self) -> pd.DataFrame:
        """One row per recorded (query, retrieved document) pair.

        Columns: `query_text`, `document_text`, `retrieval_rank` (0-based
        position in the returned list) and `score`. Queries appear in the order
        they were last searched.
        """
        query_texts = []
        document_texts = []
        retrieval_ranks = []
        scores = []
        for query_text, document_score_tuples in self.query_text_to_document_score_tuples.items():
            for retrieval_rank, (document, score) in enumerate(document_score_tuples):
                query_texts.append(query_text)
                document_texts.append(document.page_content)
                retrieval_ranks.append(retrieval_rank)
                scores.append(score)
        return pd.DataFrame.from_dict(
            {
                "query_text": query_texts,
                "document_text": document_texts,
                "retrieval_rank": retrieval_ranks,
                "score": scores,
            }
        )

# Step 4. (Ignore for now) Create a modifiable system prompt

**Background:** Here I intended to create a system prompt which can guide the QA system's behavior.  This is for future experiments only

**Instructions:** Run each cell.

In [None]:
# Sources from https://smith.langchain.com/hub/rlm/rag-prompt
#rag_system_prompt = "hub.pull("rlm/rag-prompt")"
#print(rag_system_prompt)

#from langchain.prompts import PromptTemplate
#prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
#
#{context}
#
#Question: {question}
#Answer:"""
#PROMPT = PromptTemplate(
#    template=prompt_template, input_variables=["context", "question"]
#)
#
#chain_type_kwargs = {"prompt": PROMPT}
#qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=docsearch.as_retriever(), chain_type_kwargs=chain_type_kwargs)


#try the following https://www.aitidbits.ai/p/advanced-prompting for one blog post


# Step 5. Test RAG Chain across multiple chunking strategies for a single test prompt

**Background:** Adaptation of code from the Arize Tutorial. Adjusted to enable search within a specific set of namespaces and output the results.

**Instructions:** Run each cell below. No need for any user input

In [None]:
#pull list of namespaces from python file- ensure there is an output somewhere with a json of namespaces?
# Each chunking strategy is expected to have one "<namespace>.pq" file in the
# output folder; the namespace names are recovered from those file names.
directory_path = os.environ.get("OUTPUT_PARQUET_PATH")

# Sorted so the experiment order is the same on every run (os.listdir order is arbitrary).
parquet_files = sorted(file for file in os.listdir(directory_path) if file.endswith('.pq'))

# splitext removes only the trailing extension; str.replace would also strip
# a '.pq' that appears in the middle of a file name.
file_names_without_extension = [os.path.splitext(file)[0] for file in parquet_files]

# 'knowledge_db' is the combined knowledge-base file, not a namespace.
namespaces = pd.DataFrame(
    {'File_Name': [name for name in file_names_without_extension if name != 'knowledge_db']}
)



In [None]:
# Cross-check the namespaces derived from the parquet files against the
# namespaces reported by the Pinecone index itself.
print(f"namespaces from files:{namespaces}")
# Fixed two NameErrors: the index name was misspelled and the index handle was
# assigned under one name and used under another.
p_index = pinecone.Index(pinecone_index_name)
stat_dict = p_index.describe_index_stats()
list_of_namespaces = stat_dict["namespaces"]
print(f"namespaces from the index itself:{list_of_namespaces}")

In [None]:
# Adapted from Arize Pinecone Search and Retrieval
# Runs one fixed test query against every namespace (chunking strategy) and
# collects the answer, the query embedding and the retrieved contexts.
embedding_model_name = "text-embedding-ada-002"
num_retrieved_documents = 2
chat_model_name = "gpt-3.5-turbo"
embeddings = OpenAIEmbeddingsWrapper(model=embedding_model_name)
llm = ChatOpenAI(model_name=chat_model_name)

query_text = "Describe the hard ball negotiation tactic and provide an example. Use no more than 2 sentences."
base_columns = ["query_text", "query_embedding", "dimension", "response_text"]

# Rows are collected as dicts and turned into a dataframe once at the end.
# Appending positional lists to a dataframe whose columns were reordered and
# extended inside the loop put values under the wrong columns on the first
# iteration and raised on the second.
output_records = []
for namespace in namespaces['File_Name']:
    docsearch = PineconeWrapper.from_existing_index(
        index_name = pinecone_index_name,
        embedding = embeddings,
        namespace = namespace  #New item
    )
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=docsearch.as_retriever(search_kwargs={"k": num_retrieved_documents}),
    )
    response_text = chain.run(query_text)

    # Look the results up by query text rather than taking "the last rows" of
    # the wrappers' dataframes, which is only correct when this query happens
    # to be the most recently inserted one.
    document_score_tuples = docsearch.query_text_to_document_score_tuples[query_text]
    query_embedding = np.array(embeddings.query_text_to_embedding[query_text])

    record = {
        "query_text": query_text,
        "query_embedding": query_embedding,
        "dimension": len(query_embedding),
        "response_text": response_text,
    }
    for rank, (document, score) in enumerate(document_score_tuples, start=1):
        record[f"Retrieved Context {rank}"] = document.page_content
        record[f"Retrieved Score {rank}"] = score
    record["namespace"] = namespace
    output_records.append(record)

if output_records:
    output_df = pd.DataFrame(output_records)
    # Keep the "namespace" column in the last position
    output_df = output_df[[col for col in output_df.columns if col != "namespace"] + ["namespace"]]
else:
    # No namespaces found: still provide an empty frame with the expected columns.
    output_df = pd.DataFrame(columns=base_columns + ["namespace"])

In [None]:
# Show the first rows of the per-namespace results for the test query.
output_df.head()

# 6. Running RAG for all queries and prepping data for evaluations

**Background:** This section is where I build the data and normalize it.

**Instructions:** Run each cell below. No need for any user input.

In [None]:
# Created this to be able to reset test_query_df without re-running everything above repeatedly.
# Works on a copy so that query_df itself stays untouched.
test_query_df = query_df.copy()
test_query_df.head()

In [None]:
#Custom code. Note the chain sometimes freezes after a few rows- and I used tracing to figure out that it is the LLM span and have requested clarity from OpenAI.
# Runs every query against every namespace (chunking strategy) and stores one
# row per (namespace, query) pair, including a "namespace" column.
# Previously each namespace wrote into the same rows, so only the last
# namespace's results survived and there was no "namespace" column for the
# later per-namespace grouping.
query_result_records = []
for namespace in namespaces['File_Name']:
    docsearch = PineconeWrapper.from_existing_index(
        index_name = pinecone_index_name,
        embedding = embeddings,
        namespace = namespace  #New item
    )
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=docsearch.as_retriever(search_kwargs={"k": num_retrieved_documents}),
    )  #define the chain and docsearch:

    # Iterate over the pristine query_df (not test_query_df, which this cell
    # replaces) so that re-running the cell gives the same result.
    for _, row in tqdm(query_df.iterrows(), total=len(query_df), desc=str(namespace)):
        query_text = row["text"]
        response_text = chain.run(query_text)

        # Look results up by query text: the wrappers keep dicts keyed by text,
        # so "last rows" of their dataframes are not necessarily this query's.
        document_score_tuples = docsearch.query_text_to_document_score_tuples[query_text]

        record = row.to_dict()
        record["namespace"] = namespace
        record["text_vector"] = np.array(embeddings.query_text_to_embedding[query_text])
        record["response"] = response_text
        for n, (document, score) in enumerate(document_score_tuples):
            record[f"Context_text_{n}"] = document.page_content
            record[f"Context_similarity_{n}"] = score
        query_result_records.append(record)

test_query_df = pd.DataFrame(query_result_records)



In [None]:
# Again created this to avoid having to rerun other cells above.
# Later cells work on this copy, leaving test_query_df untouched.
query_with_response_df = test_query_df.copy()

In [None]:
#Simulated an annotation with some random numbers - full annotation was paused for now to save time
# A fixed seed makes the simulated feedback identical on every run, so results
# are reproducible after Restart & Run All.
FEEDBACK_SEED = 42
feedback_rng = np.random.default_rng(FEEDBACK_SEED)
num_rows = len(query_with_response_df)
query_with_response_df['user_feedback'] = feedback_rng.choice([-1, 1], size=num_rows)

**Center the embeddings in both the database and queries**

Note: I did not know we could use df["column"].mean() to average numpy nd arrays!

In [None]:
## Adapted code to account for namespaces
# For every namespace, subtract the namespace's mean vector (centroid) from
# each embedding and store the result in "centered_text_vector".
# NOTE(review): this relies on `.mean()` working on a column that holds numpy
# arrays — confirm it returns an element-wise mean on the pandas version in use.
# NOTE(review): assumes database_df has the columns "namespace" and
# "text_vector_x" — confirm against the parquet written by the indexing script.
for namespace, group_df in database_df.groupby('namespace'):
    database_group_centroid = group_df["text_vector_x"].mean()
    database_df.loc[group_df.index, "centered_text_vector"] = (
        group_df["text_vector_x"].apply(lambda x: x - database_group_centroid)
    )
# Same centering for the query embeddings.
# NOTE(review): this requires a "namespace" column on query_with_response_df;
# groupby raises KeyError if the frame does not have one — verify that the
# cell producing the frame adds it.
for namespace, group_df in query_with_response_df.groupby('namespace'):
    query_gp_centroid = group_df["text_vector"].mean()
    query_with_response_df.loc[group_df.index, "centered_text_vector"] = (
        group_df["text_vector"].apply(lambda x: x - query_gp_centroid)
    )

# 7. Run LLM Evaluations

**Background:** This section is where the results are evaluated using LLMs

**Instructions:** Run each cell below. No need for any user input.

In [None]:
#Sourced from the Arize Tutorial- just changed the df names
# System message instructing the evaluator model to answer with a single
# character: 1 if the reference text answers the query, 0 otherwise.
EVALUATION_SYSTEM_MESSAGE = (
    "You will be given a query and a reference text. "
    "You must determine whether the reference text contains an answer to the input query. "
    "Your response must be binary (0 or 1) and "
    "should not contain any text or characters aside from 0 or 1. "
    "0 means that the reference text does not contain an answer to the query. "
    "1 means the reference text contains an answer to the query."
)
# User-message template; `{query}` and `{reference}` are filled in with
# str.format, and the text ends with "# Binary: " for the model to complete.
QUERY_CONTEXT_PROMPT_TEMPLATE = """# Query: {query}

# Reference: {reference}

# Binary: """


@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def evaluate_query_and_retrieved_context(query: str, context: str, model_name: str) -> str:
    """Ask a chat model whether `context` contains an answer to `query`.

    The evaluation system message and the filled-in query/reference template
    are sent to `model_name`; the raw text of the first choice is returned
    unparsed. The call is wrapped in the tenacity `retry` decorator above
    (random exponential wait, stop after 6 attempts).
    """
    chat_messages = [
        {"role": "system", "content": EVALUATION_SYSTEM_MESSAGE},
        {
            "role": "user",
            "content": QUERY_CONTEXT_PROMPT_TEMPLATE.format(query=query, reference=context),
        },
    ]
    completion = openai.ChatCompletion.create(messages=chat_messages, model=model_name)
    return completion["choices"][0]["message"]["content"]


def evaluate_retrievals(
    retrievals_data: Dict[str, str],
    model_name: str,
) -> List[str]:
    """Evaluate every (query, retrieved context) pair in `retrievals_data`.

    Args:
        retrievals_data: Mapping from query text to one retrieved context.
        model_name: Chat model used for the evaluation.

    Returns:
        The raw evaluator responses, in the mapping's iteration order.
    """
    return [
        evaluate_query_and_retrieved_context(query_text, context_text, model_name)
        for query_text, context_text in tqdm(retrievals_data.items())
    ]


def process_binary_responses(
    binary_responses: List[str], binary_to_string_map: Dict[int, str]
) -> List[str]:
    """
    Convert raw binary responses into their string labels.

    Each response is stripped of surrounding whitespace, parsed as an integer
    and looked up in binary_to_string_map, a dictionary mapping binary values
    (0 or 1) to the desired string values (e.g. "irrelevant" or "relevant").
    A response that is not an integer, or whose value is not a key of the
    map, becomes None.
    """

    def _to_label(raw_response):
        # None marks a response that could not be parsed or mapped.
        try:
            return binary_to_string_map[int(raw_response.strip())]
        except (ValueError, KeyError):
            return None

    return [_to_label(raw_response) for raw_response in binary_responses]


# Evaluate, row by row, whether each retrieved context answers its query, and
# store the label in "openai_relevance_<context index>".
eval_query_df = query_with_response_df.copy()
evaluation_model_name = "gpt-4"  # use GPT-4 if you have access
binary_to_label = {0: "irrelevant", 1: "relevant"}
for context_index in range(num_retrieved_documents):
    context_column = f"Context_text_{context_index}"
    # One evaluation per dataframe row. Building a dict keyed by query text
    # collapsed rows that share the same text, so the number of responses
    # could be smaller than the number of rows and the column assignment
    # below would fail or misalign.
    raw_responses = [
        evaluate_query_and_retrieved_context(query, context, evaluation_model_name)
        for query, context in tqdm(
            zip(eval_query_df["text"], eval_query_df[context_column]),
            total=len(eval_query_df),
        )
    ]
    processed_responses = process_binary_responses(raw_responses, binary_to_label)
    eval_query_df[f"openai_relevance_{context_index}"] = processed_responses
eval_query_df.head(1)

## 8. Compute Precision @K and relevance, open the Phoenix Session, and save/load data for easy analysis

**Background:** This section is where we calculate precision and launch the first Phoenix session to be able to view the traces.

**Instructions:** Run each cell below. No need for any user input.


In [None]:
#Sourced from the Arize Tutorial

# Computes precision@k for k = 1..num_retrieved_documents: for each row, the
# number of the top-k retrieved contexts labelled "relevant", divided by k.
# The running count is accumulated across k so each relevance column is read once.

num_relevant_documents_array = np.zeros(len(eval_query_df))
# A separate loop variable is used so the global num_retrieved_documents
# setting is not overwritten while iterating.
for k in range(1, num_retrieved_documents + 1):
    num_relevant_documents_array += (
        eval_query_df[f"openai_relevance_{k - 1}"]
        .map(lambda x: int(x == "relevant"))
        .to_numpy()
    )
    # Assign the array directly (by position). Wrapping it in pd.Series would
    # align on index labels and produce NaN for a frame without a 0..n-1 index.
    eval_query_df[f"openai_precision@{k}"] = num_relevant_documents_array / k

relevance_columns = [f"openai_relevance_{i}" for i in range(num_retrieved_documents)]
precision_columns = [f"openai_precision@{k}" for k in range(1, num_retrieved_documents + 1)]
eval_query_df[relevance_columns + precision_columns]

In [None]:
# add in recall @K, MaP, RAGAS, MRR


In [None]:
# add in calculation of p50,90,95,99

In [None]:
# add in calculation of RPM and Char per min

**Launch session url to analyze the trace data**

In [None]:
#Custom code- Note I ran this cell in various locations to debug as I built the project up
# Display the URL of the Phoenix session launched earlier.
session.url

**Save/Load data from my google drive**

 Note the actual file will be sent along with the other artifacts for the project. I did not set up a GCP account unfortunately!

In [None]:
# Custom code - it will be commented out before submission.
# NOTE(review): `drive` is not imported anywhere visible in this notebook (the
# `from google.colab import drive` line in the imports cell is commented out),
# so this cell raises NameError unless that import is restored.

drive.mount('/content/drive')

# Fetch the spans whose kind is RETRIEVER from the active Phoenix session.
spans_df=px.active_session().get_spans_dataframe('span_kind == "RETRIEVER"')

file_path = "/content/drive/MyDrive/Arize/spans.pq"

# Save the spans as a parquet file on the mounted drive.
spans_df.to_parquet(file_path, engine='pyarrow')

# List the target folder to confirm the file was written.
!ls '/content/drive/MyDrive/Arize/'




# 9. Launch a new session of Phoenix to analyze embeddings data

In [None]:
#Code from Arize Tutorial

# Columns offered to Phoenix as tags on the query dataset. Only those that
# actually exist in eval_query_df are passed on: the "euclidean_distance_*"
# columns, for example, are not created anywhere in this notebook, and a schema
# naming a missing column is expected to be rejected when the dataset is built
# (NOTE(review): confirm against the Phoenix schema validation docs).
candidate_tag_columns = [
    "namespace",
    "Context_text_0",
    "Context_similarity_0",
    "Context_text_1",
    "Context_similarity_1",
    "euclidean_distance_0",
    "euclidean_distance_1",
    "openai_relevance_0",
    "openai_relevance_1",
    "openai_precision@1",
    "openai_precision@2",
    "user_feedback",
]

# Schema for the queries: the query text with its centered embedding as the
# prompt, the generated answer as the response, everything else as tags.
query_schema = px.Schema(
    prompt_column_names=px.EmbeddingColumnNames(
        raw_data_column_name="text",
        vector_column_name="centered_text_vector",
    ),
    response_column_names="response",
    tag_column_names=[
        column for column in candidate_tag_columns if column in eval_query_df.columns
    ],
)
# Schema for the knowledge base: chunk text with its centered embedding.
database_schema = px.Schema(
    document_column_names=px.EmbeddingColumnNames(
        raw_data_column_name="text",
        vector_column_name="centered_text_vector",
    ),
)

In [None]:
#Code from Arize Tutorial
# Wrap the knowledge-base dataframe and the evaluated query dataframe in
# Phoenix datasets, each paired with its schema and given a display name.
database_ds = px.Dataset(
    dataframe=database_df,
    schema=database_schema,
    name="reference",
)
query_ds = px.Dataset(
    dataframe=eval_query_df,
    schema=query_schema,
    name="query",
)

In [None]:
#Code from Arize Tutorial
# Launch a second Phoenix session with the query dataset as the primary
# dataset and the knowledge base passed as the corpus.
session2 = px.launch_app(query_ds, corpus=database_ds)

In [None]:
# Save the evaluated queries to the mounted drive as a parquet file.
# NOTE(review): the /content/... paths assume a Colab runtime with the drive
# mounted at /content/drive — confirm before running elsewhere.
file_path2 = "/content/drive/MyDrive/Arize/eval_query.pq"

eval_query_df.to_parquet(file_path2, engine='pyarrow')


import shutil

# Copy the knowledge-base parquet next to it.
source_path = '/content/parq/knowledge_db.pq'
destination_path = '/content/drive/MyDrive/Arize/knowledge_db.pq'

shutil.copy(source_path, destination_path)



# List the target folder to confirm both files are present.
!ls '/content/drive/MyDrive/Arize/'

# 10. Calculate Average Precision @K,P50,P90,P95 from the data and set the baseline for results from Fixed Chunking Size.

In [None]:
#Average precision @K
for namespace in namespaces:
