### Documentation 17.09.2024
This script looks up the documents for which Product AI generated responses in 02-generate_productai_response. It reads in the parquets that were saved previously, which contain the columns question, answer, chunk, document, productai_response. It then evaluates how accurate the "productai_response" entries are by comparing them to the "answer" entries.

In [None]:
import os
import pandas as pd
from datetime import datetime
from objects.evaluator import Evaluator
from tqdm import tqdm
import time
from langchain_openai.chat_models import AzureChatOpenAI

In [None]:
# Locates the question/answer/ProductAI-response parquet with the highest
# timestamp (most recent run) and exposes its path and timestamp.

# NOTE: the folder is keyed on today's date, so only files written today are found.
base_folder = "/Volumes/uc-catalog-dev/advancedanalytics-productai-dev/transformed_dev/llm-evaluation/" + datetime.now().strftime("%Y-%m-%d") + "/"


def _extract_timestamp(parquet_name):
    """Return the timestamp embedded in a QAR-triplet parquet file name.

    The timestamp is the sixth underscore-separated token of the name,
    with everything from the first dot onwards removed.
    """
    return parquet_name.split('_')[5].split('.')[0]


# Get all Question-Answer-Response triplet parquets
file_names = [
    f for f in os.listdir(base_folder)
    if f.startswith("question_answer_pairs+product_ai_answers") and f.endswith(".parquet")
]

# Fail with an actionable message instead of max()'s "empty sequence" error
if not file_names:
    raise FileNotFoundError(
        f"No 'question_answer_pairs+product_ai_answers*.parquet' file found in {base_folder}"
    )

# Retrieve the QAR-triplet parquet with the highest timestamp (most recent run)
file_name = max(file_names, key=_extract_timestamp)

path = os.path.join(base_folder, file_name)

timestamp = _extract_timestamp(file_name)
print("Timestamp:", timestamp)

### Evaluate correctness of ProductAI response

In [None]:
# Scores every ProductAI response against its reference answer, one document
# at a time. Each document's results are written to their own parquet as a
# checkpoint, and the combined results are saved and displayed at the end.
df = pd.read_parquet(path)
model = "evaluation_gpt4o"
api_key = dbutils.secrets.get(scope='keyvault-link', key='azure-openai-api-key')

# Per-document checkpoints live in a sub-folder named after the run timestamp;
# create it up front so the first to_parquet call does not fail.
checkpoint_folder = base_folder + timestamp
os.makedirs(checkpoint_folder, exist_ok=True)

all_groups = []
grouped_df = df.groupby('document')

for document, group in tqdm(grouped_df):
    doc_scores = []
    doc_reasonings = []

    # Evaluate the score of the chatbot responses
    for _, row in group.iterrows():
        evaluator = Evaluator(row['question'], row['answer'], row['productai_response'], model, api_key)
        response = evaluator.evaluate_correctness()
        doc_scores.append(response['score'])
        doc_reasonings.append(response['reasoning'])

    # Work on an explicit copy so the new columns are not assigned onto a
    # slice of the original dataframe.
    group = group.copy()
    # doc_scores is a plain list (no .astype); convert through a Series that
    # shares the group's index so the values stay aligned row by row.
    group['evaluation_score'] = pd.Series(doc_scores, index=group.index).astype(float)
    group['evaluation_reasoning'] = doc_reasonings
    save_path = checkpoint_folder + f"/question_answer_pairs+productai_answers+evaluation_results_{document}.parquet"
    group.to_parquet(save_path, index=False)

    all_groups.append(group)

# Saving
df = pd.concat(all_groups)
save_path = base_folder + f"question_answer_pairs+productai_answers+evaluation_results_{timestamp}.parquet"
df.to_parquet(save_path, index=False)
display(df)

#### If the code above fails, you can run the code below to keep the progress and resume from where you left off (the code is wrapped in triple quotes; remove them to run it)
#### Make sure to set the variable `timestamp` to the name of the folder where the progress is saved

In [None]:
# timestamp = "20240928203654"

In [None]:
# Resume helper (disabled): the code is kept inside a string so it is not
# executed. Remove the surrounding triple quotes to run it.
'''
# Resumes an interrupted evaluation: reloads the per-document results already
# saved under base_folder/timestamp and evaluates only the remaining documents.
# Requires `base_folder`, `path` and `timestamp` to be defined; `path` must
# point to the question/answer/response parquet of the run being resumed.
df = pd.read_parquet(path)
model = "evaluation_gpt4o"
api_key = dbutils.secrets.get(scope='keyvault-link', key='azure-openai-api-key')

all_groups = []
grouped_df = df.groupby('document')

# Read in all previously saved dataframes
checkpoint_prefix = 'question_answer_pairs+productai_answers+evaluation_results_'
checkpoint_suffix = '.parquet'
checkpoint_folder = os.path.join(base_folder, timestamp)
saved_files = [
    os.path.join(checkpoint_folder, f) for f in os.listdir(checkpoint_folder)
    if f.startswith(checkpoint_prefix) and f.endswith(checkpoint_suffix)
]
for file in saved_files:
    all_groups.append(pd.read_parquet(file))

# Get the already processed documents: the document name is whatever sits
# between the checkpoint prefix and the file extension.
processed_docs = {
    os.path.basename(f)[len(checkpoint_prefix):-len(checkpoint_suffix)]
    for f in saved_files
}

for document, group in tqdm(grouped_df):
    # Skip documents that already have a saved checkpoint
    if str(document) in processed_docs:
        continue

    doc_scores = []
    doc_reasonings = []

    # Evaluate the score of the chatbot responses
    for _, row in group.iterrows():
        evaluator = Evaluator(row['question'], row['answer'], row['productai_response'], model, api_key)
        response = evaluator.evaluate_correctness()
        doc_scores.append(response['score'])
        doc_reasonings.append(response['reasoning'])

    # Explicit copy so the new columns are not assigned onto a slice
    group = group.copy()
    # doc_scores is a plain list; convert through an index-aligned Series
    group['evaluation_score'] = pd.Series(doc_scores, index=group.index).astype(float)
    group['evaluation_reasoning'] = doc_reasonings
    save_path = base_folder + timestamp + f"/question_answer_pairs+productai_answers+evaluation_results_{document}.parquet"
    group.to_parquet(save_path, index=False)

    all_groups.append(group)

# Saving
df = pd.concat(all_groups)
save_path = base_folder + f"question_answer_pairs+productai_answers+evaluation_results_{timestamp}.parquet"
df.to_parquet(save_path, index=False)
display(df)
'''