# Documentation
This notebook adds columns to the result dataframes, such as `answer_length`, `productai_response_length` and `vectorsearch_works`, a boolean that indicates whether the expected document appeared among the vector search results for the question.

In [None]:
import pandas as pd
import os
from datetime import datetime
import json
from databricks.vector_search.client import VectorSearchClient

In [None]:
# Finds today's newest question/answer/response parquet and prints its timestamp

# Results are stored in one folder per day, named after the current local date
base_folder = (
    "/Volumes/uc-catalog-dev/advancedanalytics-productai-dev/transformed_dev/llm-evaluation/"
    + datetime.now().strftime("%Y-%m-%d")
    + "/"
)

# Keep only the Question-Answer-Response triplet parquets
file_names = [
    entry
    for entry in os.listdir(base_folder)
    if entry.startswith("question_answer_pairs+product_ai_answers") and entry.endswith(".parquet")
]

# The sixth "_"-separated token of the file name (without its extension) is used as the
# timestamp; the file whose token compares highest as a string is treated as the latest run
file_name = max(file_names, key=lambda name: name.split('_')[5].split('.')[0])
timestamp = file_name.split('_')[5].split('.')[0]

path = os.path.join(base_folder, file_name)

print("Timestamp:", timestamp)

In [None]:
# Locates today's most recent evaluation-results parquet and extracts its timestamp

# Keep the date string under its own name: rebinding `datetime` would shadow the imported
# class, so re-running this cell (or the previous one) would fail with AttributeError.
run_date = datetime.now().strftime("%Y-%m-%d")
base_folder = f"/Volumes/uc-catalog-dev/advancedanalytics-productai-dev/transformed_dev/llm-evaluation/{run_date}/"

# NOTE(review): the spelling of this file name is uncertain ("product_ai_answers" vs
# "productai_answers"); both are accepted here - confirm which one the upstream job writes.
results_prefixes = (
    "question_answer_pairs+product_ai_answers+evaluation_results",
    "question_answer_pairs+productai_answers+evaluation_results",
)


def _results_timestamp(name):
    """Return what sits between the results prefix and the ".parquet" extension of *name*."""
    stem = name[:-len(".parquet")]
    for prefix in results_prefixes:
        if stem.startswith(prefix):
            # Stripping the known prefix is used instead of picking a fixed "_"-separated
            # token: the token position depends on the spelling of the prefix, and a
            # timestamp that itself contains "_" would be cut short.
            return stem[len(prefix):].lstrip("_")
    return stem


# Get all of the final parquets (str.startswith accepts a tuple of alternatives)
file_names = [f for f in os.listdir(base_folder) if f.startswith(results_prefixes) and f.endswith(".parquet")]

# Fail with a readable message instead of max()'s "arg is an empty sequence"
if not file_names:
    raise FileNotFoundError(f"No evaluation result parquet found in {base_folder}")

# Retrieve parquet with highest timestamp (most recent run); timestamps are compared as strings
file_name = max(file_names, key=_results_timestamp)

path = os.path.join(base_folder, file_name)

timestamp = _results_timestamp(file_name)
print("Timestamp:", timestamp)

In [None]:
# Connection settings for Databricks Vector Search are read from environment variables
# (each value is None when its variable is unset)
workspace_url = os.getenv("WORKSPACE_URL")
sp_client_id = os.getenv("SP_CLIENT_ID")
sp_client_secret = os.getenv("SP_CLIENT_SECRET")

# Authenticate with the service principal credentials
client_settings = {
    "workspace_url": workspace_url,
    "service_principal_client_id": sp_client_id,
    "service_principal_client_secret": sp_client_secret,
}
vsc = VectorSearchClient(**client_settings)

# Handle to the index that the similarity searches in this notebook run against
index = vsc.get_index(
    index_name="uc-catalog-dev.advancedanalytics-validationservices-dev.validation_services_index_20240919_085338",
    endpoint_name="vectorsearch-dev",
)

In [None]:
# Retrieves the results of vectorsearch for one row
def check_document_in_similar_docs(row):
    """Check whether the row's expected document shows up in the vector search results.

    Args:
        row: Mapping-like row (e.g. a pandas Series) providing a "question" and a
            "document" entry.

    Returns:
        A tuple ``(found, source_titles, similarity_scores)``. ``found`` is True when
        ``row["document"]`` is a substring of at least one returned source title.
        ``source_titles`` and ``similarity_scores`` hold the distinct titles and scores
        of up to 10 hits, in the order the search returned them. Both lists are
        de-duplicated independently, so they do not necessarily line up item by item.
    """
    query = row["question"]
    document = row["document"]
    similar_docs = index.similarity_search(
        num_results=10,
        columns=["text"],
        query_text=query
    )
    # NOTE(review): a response without hits is assumed to possibly lack "data_array";
    # treat that case as "no results" instead of raising KeyError - confirm against the API.
    hits = (similar_docs.get('result') or {}).get('data_array') or []
    # doc[0] is the requested "text" column, parsed as JSON with a 'source_title' field;
    # doc[1] is used as the similarity score of the hit
    source_titles_and_scores = [
        (json.loads(doc[0])['source_title'], doc[1])
        for doc in hits
    ]
    # dict.fromkeys removes duplicates while keeping first-seen order; a set would discard
    # the ranking and make the stored order vary between runs
    source_titles = list(dict.fromkeys(title for title, _ in source_titles_and_scores))
    similarity_scores = list(dict.fromkeys(score for _, score in source_titles_and_scores))
    found = any(document in source_title for source_title in source_titles)
    return found, source_titles, similarity_scores

# Adds several columns that are important for the visualization of the results
def add_columns(df):
    """Enrich *df* in place with vector search and length columns, then return it.

    Expects the columns "question", "document", "productai_response_time",
    "evaluation_score", "productai_response" and "answer" to be present.
    """
    # One vector search per row; the returned 3-tuple is expanded into three columns
    vectorsearch_columns = ['vectorsearch_works', 'vectorsearch_source_titles', 'vectorsearch_similarity_scores']
    vectorsearch_results = df.apply(check_document_in_similar_docs, axis=1, result_type='expand')
    df[vectorsearch_columns] = vectorsearch_results

    # Make sure the numeric measurements are stored as floats
    for numeric_column in ('productai_response_time', 'evaluation_score'):
        df[numeric_column] = df[numeric_column].astype(float)

    # Lengths of the generated response and of the reference answer
    df['productai_response_length'] = df['productai_response'].map(len)
    df['answer_length'] = df['answer'].map(len)
    return df

In [None]:
# Read the exact file located by the discovery cell via `path`; rebuilding the file name
# from `timestamp` risks a spelling mismatch with the prefix used to find the file.
df = pd.read_parquet(path)

# Add the vector search and length columns used by the visualizations
df = add_columns(df)

# Store the enriched results next to the input, tagged with the same timestamp
df.to_parquet(os.path.join(base_folder, f"curated_results_{timestamp}.parquet"))
display(df)

In [None]:
# Functions that count the number of tokens generated by LLMs
def count_tokens(text):
    """Return the number of tokens in *text* under the encoding tiktoken maps to "gpt-4"."""
    # Imported here because the notebook's import cell does not import tiktoken;
    # without this the function raises NameError on first use.
    import tiktoken

    encoding = tiktoken.encoding_for_model("gpt-4")
    return len(encoding.encode(text))
def tokenize_df(df):
    """Return a new dataframe in which every cell is replaced by its token count.

    Each value is converted to ``str`` before counting, so non-text columns are counted
    by their string representation. The input dataframe is left unchanged.
    """
    # Work on a copy: assigning into the caller's frame would overwrite its contents
    # with token counts.
    token_counts = df.copy()
    for column in token_counts.columns:
        token_counts[column] = token_counts[column].astype(str).apply(count_tokens)
    return token_counts

In [None]:
# Compute the per-cell token counts and store them alongside the other outputs of this run
token_df = tokenize_df(df)
tokenized_path = base_folder + f"tokenized_df_{timestamp}.parquet"
token_df.to_parquet(tokenized_path)
display(token_df)