In [3]:
import os
import time
import pinecone
import pandas as pd

from wrapper.openAiEmbeddingWrapper import OpenAIEmbeddingsWrapper
from wrapper.pineconeWrapper import PineconeWrapper

from utils.evaluation import evaluate_retrievals, process_binary_responses
from utils.customPrompt import get_custom_chain_type_args

from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder, PromptTemplate
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI

  from tqdm.autonotebook import tqdm


In [4]:
MODEL = "text-embedding-ada-002"

pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_env = os.getenv("PINECONE_ENV")
pinecone_index_name = os.getenv("PINECONE_INDEX")
pinecone_namespace = os.getenv("PINECONE_NAMESPACE")

pinecone.init(
    api_key=pinecone_api_key,
    environment=pinecone_env,
)
index = pinecone.Index(pinecone_index_name)

chat_model_name = "gpt-3.5-turbo"
evaluation_model_name = "gpt-3.5-turbo"
num_retrieved_documents = 2

embeddings = OpenAIEmbeddingsWrapper(model=MODEL)
docsearch = PineconeWrapper.from_existing_index(
    index_name=pinecone_index_name,
    embedding=embeddings,
    namespace=pinecone_namespace
)


llm = ChatOpenAI(temperature=0.5, model_name=chat_model_name,
                 request_timeout=30)
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    chain_type_kwargs=get_custom_chain_type_args()
)

In [11]:
def generate_query_df(prompts, chain, docsearch, embeddings, num_retrieved_documents=2):
    query_df_data = {"text": [], "text_vector": [],
                     "response": []  # "user_feedback": []
                     }

    for context_index in range(num_retrieved_documents):
        query_df_data[f"context_text_{context_index}"] = []
        query_df_data[f"context_similarity_{context_index}"] = []

    for index, prompt in enumerate(prompts):
        print('progress => ', index, '/', len(prompts))
        time.sleep(1)  # Solve hang issue

        response_text = chain.run(prompt)  # chain.run(prompt)
        print('.')
        retrievals_df = docsearch.retrieval_dataframe.tail(
            num_retrieved_documents)
        print('..')
        contexts = retrievals_df["document_text"].to_list()
        scores = retrievals_df["score"].to_list()
        query_embedding = embeddings.query_embedding_dataframe["text_vector"].iloc[-1]
        print('...')

        query_df_data["text"].append(prompt)
        query_df_data["text_vector"].append(query_embedding.tolist())
        query_df_data["response"].append(response_text)
        # query_df_data["user_feedback"].append(float("nan"))  # TODO: Implement

        for context_index in range(num_retrieved_documents):
            context_text = contexts[context_index] if context_index < len(
                contexts) else ""
            score = scores[context_index] if context_index < len(
                scores) else float(0)

            query_df_data[f"context_text_{context_index}"].append(context_text)
            query_df_data[f"context_similarity_{context_index}"].append(score)

    return pd.DataFrame(query_df_data)


def generate_response_df(prompt, num_retrieved_documents=2):
    query_df = generate_query_df(
        prompts=[prompt],
        chain=chain,
        docsearch=docsearch,
        embeddings=embeddings,
        num_retrieved_documents=num_retrieved_documents
    )

    return query_df


In [13]:
response_df = generate_response_df('Do you have adidas shoes for sale?')
response_obj = response_df.to_json(orient='records')

response_df.head()

progress =>  0 / 1
.
query_text = Do you have adidas shoes for sale?
                           query_text  \
2  Do you have adidas shoes for sale?   
3  Do you have adidas shoes for sale?   

                                       document_text  retrieval_rank     score  
2  Title: ADIDAS | SUPERSTAR 80S Description: The...               2  0.769135  
3  Title: ADIDAS | CLASSIC BACKPACK | LEGEND INK ...               3  0.759331  
..
...


Unnamed: 0,text,text_vector,response,context_text_0,context_similarity_0,context_text_1,context_similarity_1
0,Do you have adidas shoes for sale?,"[-0.0005319537922695433, -0.023252987804327257...","Yes, we have a wide selection of adidas shoes ...",Title: ADIDAS | SUPERSTAR 80S Description: The...,0.769135,Title: ADIDAS | CLASSIC BACKPACK | LEGEND INK ...,0.759331


In [22]:
import phoenix as px

from utils.prep import generate_products_json
from utils.pinecone import upsert_doc, get_vectorstores_data
from utils.model import generate_response_df, evaluate_response_df
from utils.mongodb import get_documents, update_documents
from utils.arizeAI import get_arize_url
from utils.arizeAI import data_prep

database_df = get_vectorstores_data()
query_df = pd.DataFrame(list(get_documents()))

px.close_app()

[query_df, database_df] = data_prep(query_df, database_df)

query_schema = px.Schema(
    prompt_column_names=px.EmbeddingColumnNames(
        raw_data_column_name="text",
        vector_column_name="centered_text_vector",
    ),
    response_column_names="response",
    tag_column_names=[
        "context_text_0",
        "context_similarity_0",
        "context_text_1",
        "context_similarity_1",
        # "euclidean_distance_0",
        # "euclidean_distance_1",
        "openai_relevance_0",
        "openai_relevance_1",
        "openai_precision@1",
        "openai_precision@2",
    ],
)

database_schema = px.Schema(
    document_column_names=px.EmbeddingColumnNames(
        raw_data_column_name="text",
        vector_column_name="centered_text_vector",
    ),
)

# Dataset
prim_ds = px.Dataset(
    dataframe=query_df,
    schema=query_schema,

    name="query",
)
corpus_ds = px.Dataset(
    dataframe=database_df,
    schema=database_schema,
    name="pinecone",
)

session = px.launch_app(
    primary=prim_ds, corpus=corpus_ds, port=6060)

print('session =', session)

https://b2a2-2001-fb1-9d-ffb6-c1b0-1480-c025-ab6c.ngrok-free.app 6060
🌍 To view the Phoenix app in your browser, visit http://127.0.0.1:6060/
📺 To view the Phoenix app in a notebook, run `px.active_session().view()`
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix
session = <phoenix.session.session.ThreadSession object at 0x13b806400>
