### Documentation 17.09.2024
This script determines for which documents Question-Answer pairs were generated today in 01-generate_qa_pairs. For each document, it reads in the Question-Answer pairs from the parquet files. It then goes through each pair and prompts Product AI with the question. Finally, it writes the responses into the same folder where the Question-Answer pairs are stored.

If you want to run this script you will have to update the cookie. To get the cookie follow this process:
1. Use Microsoft Edge (also applicable for Firefox, but positional instructions may vary)
2. Go to https://app-validation-services-dev.azurewebsites.net/ and login.
3. Right click and select "Untersuchen" or "Inspect" depending on the language.
4. Select the "Netzwerk" or "Network" tab at the top. Its icon looks like the WiFi icon.
5. Type in any prompt in the chat, such as "hello". On the right, a request will pop up with the name "chat".
6. Click on the request and in the headers you will find the cookie.
7. Copy & paste the cookie into the cell below.

In [None]:
# Session cookie of the Product AI web app; it is handed to ProductAIPrompter further below.
# Replace the placeholder with the cookie copied from the "chat" request in the browser.
cookie = "<insert_cookie>"


In [None]:
from objects.product_ai_prompter import ProductAIPrompter
import pandas as pd
from datetime import datetime
import os
from tqdm import tqdm

### Paths to Files

In [None]:
# Locates today's evaluation folder and selects the question_answer_pairs parquet with the
# highest timestamp (most recent run). Defines base_folder, file_name, path and timestamp.

base_folder = "/Volumes/uc-catalog-dev/advancedanalytics-productai-dev/transformed_dev/llm-evaluation/" + datetime.now().strftime("%Y-%m-%d") + "/"


def _session_timestamp(name):
    """Return the <timestamp> part of a 'question_answer_pairs_<timestamp>.parquet' file name."""
    return name.split('_')[3].split('.')[0]


# get all QA-pair parquets
file_names = [f for f in os.listdir(base_folder) if f.startswith("question_answer_pairs_") and f.endswith(".parquet")]
print("These are the evaluation sessions from today:", file_names)

# Fail with an actionable message instead of the cryptic "max() arg is an empty sequence".
if not file_names:
    raise FileNotFoundError(
        f"No question_answer_pairs_*.parquet found in {base_folder}. "
        "Run 01-generate_qa_pairs first or set the path manually."
    )

# Retrieve the QA-pair parquet with the highest timestamp (most recent run).
# NOTE(review): timestamps are compared as strings, which is chronological only if they are
# fixed-width (e.g. %Y%m%d%H%M%S) - confirm against the generating script.
file_name = max(file_names, key=_session_timestamp)
print("This is the current evaluation session:", file_name)

path = os.path.join(base_folder, file_name)

timestamp = _session_timestamp(file_name)
print("Timestamp:", timestamp)

### For Rustam: The code above will probably not work, so I have written this code for you here.

In [None]:
# Manual alternative to the automatic lookup: set the input parquet by hand.
# base_folder is today's evaluation folder; the results are written below it.
base_folder = "/Volumes/uc-catalog-dev/advancedanalytics-productai-dev/transformed_dev/llm-evaluation/" + datetime.now().strftime("%Y-%m-%d") + "/"
# Replace the placeholder with the full path of the question-answer pairs parquet.
# It is a quoted string (like the cookie placeholder) so that the cell stays valid Python.
path = "<insert_path_of_question_answer_pairs_parquet>"
# A fresh timestamp identifies this run; it is used in the names of the output files.
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

### Prompt ProductAI to answer a list of questions

In [None]:
# Prompts ProductAI with every generated question, one document at a time.
# The results of each document are written to <base_folder>/<timestamp>/ as soon as that
# document is finished, so an interrupted run loses at most the document in progress.
# note: question 0 is useless because it asked a question on the table of contents which is provided on the first page
df = pd.read_parquet(path)
prompter = ProductAIPrompter(cookie)

# The per-document parquets go into a session sub-folder. Create it up front, because it
# may not exist yet (e.g. when the timestamp was freshly generated for this run).
session_folder = os.path.join(base_folder, timestamp)
os.makedirs(session_folder, exist_ok=True)

grouped_df = df.groupby('document')
all_groups = []

for document, group in tqdm(grouped_df):
    # Work on a copy so that adding columns never touches (or warns about) a slice of df.
    group = group.copy()

    # Prompt the chatbot under evaluation with the questions, in row order.
    # Each call is unpacked as a (response, response_time) pair.
    results = [prompter.prompt_productai(question) for question in group['question']]
    group['productai_response'] = [response for response, _ in results]
    group['productai_response_time'] = [response_time for _, response_time in results]

    # NOTE(review): assumes the document name is usable as part of a file name - confirm.
    save_path = os.path.join(session_folder, f"question_answer_pairs+product_ai_answers_{document}.parquet")
    group.to_parquet(save_path, index=False)

    all_groups.append(group)

# Saving the combined result of all documents next to the input parquet
df = pd.concat(all_groups)
save_path = os.path.join(base_folder, f"question_answer_pairs+product_ai_answers_{timestamp}.parquet")
df.to_parquet(save_path, index=False)
display(df)

#### If the code above fails, you can run the code below to keep the progress and start from where you left off
#### Make sure to change the variable timestamp to the name of the folder where the progress is saved

In [None]:
# timestamp = "20240928203654"

In [None]:
# Disabled by default: remove the surrounding triple quotes to resume an interrupted run.
# It reloads the per-document parquets already saved under <base_folder>/<timestamp>/ and
# prompts ProductAI only for the documents that have no saved parquet yet.
'''
df = pd.read_parquet(path)
prompter = ProductAIPrompter(cookie)

session_folder = os.path.join(base_folder, timestamp)
file_prefix = "question_answer_pairs+product_ai_answers_"
file_suffix = ".parquet"

# Read in all previously saved dataframes
saved_files = [os.path.join(session_folder, f) for f in os.listdir(session_folder)
               if f.startswith(file_prefix) and f.endswith(file_suffix)]
all_groups = [pd.read_parquet(file) for file in saved_files]

# Get the set of already processed documents (file name without prefix and suffix)
processed_docs = {os.path.basename(f)[len(file_prefix):-len(file_suffix)] for f in saved_files}

# Keep only the rows of documents that still have to be processed.
# The document is compared as a string because that is how it was written into the file name.
remaining_df = df[~df["document"].astype(str).isin(processed_docs)]

for document, group in tqdm(remaining_df.groupby("document")):
    # Work on a copy so that adding columns never touches (or warns about) a slice of df.
    group = group.copy()

    # Prompt the chatbot under evaluation with the questions, in row order
    results = [prompter.prompt_productai(question) for question in group["question"]]
    group["productai_response"] = [response for response, _ in results]
    group["productai_response_time"] = [response_time for _, response_time in results]

    save_path = os.path.join(session_folder, f"{file_prefix}{document}{file_suffix}")
    group.to_parquet(save_path, index=False)

    all_groups.append(group)

# Saving the combined result (previously saved and newly processed documents)
df = pd.concat(all_groups)
save_path = os.path.join(base_folder, f"{file_prefix}{timestamp}{file_suffix}")
df.to_parquet(save_path, index=False)
display(df)
'''