### Documentation 17.09.2024
This script takes a list of documents and generates Question-Answer pairs for each document. It does this in a few steps for each document:
1. Chunk Document into chapters
2. Generate two Question-Answer pairs for each chapter

For testing purposes I currently take only a few documents, which you can change in cell 5.

In [None]:
from objects.qa_pair_generator import QAPairGenerator
import pandas as pd
from datetime import datetime
import os
from tqdm import tqdm

### Paths to Documents

In [None]:
# This will be moved to 00-main later on

# Folder that contains all documents
folder_path = "/Volumes/uc-catalog-dev/advancedanalytics-productai-dev/transformed_dev/filters/katharina_keese/pdf/2024-07-17_validations_poc/ex_and_val_guides/gpt-vision-pipe/document-twins/2024-08-14"

# Get all .md files from the folder.
# Sorted so the processing order is reproducible (os.listdir returns entries in arbitrary order).
document_paths = sorted(
    os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.md')
)

# Create the dated folder that will contain all results for the evaluation session.
# The trailing "/" is intentional: later code appends file names directly to this string.
save_folder = "/Volumes/uc-catalog-dev/advancedanalytics-productai-dev/transformed_dev/llm-evaluation/" + datetime.now().strftime("%Y-%m-%d") + "/"

# Only the dated folder is created here. The per-run subfolder is named after `timestamp`,
# which is not assigned until the generation cell, so referencing it at this point raised a
# NameError on a fresh kernel. The subfolder has to be created once `timestamp` is known.
os.makedirs(save_folder, exist_ok=True)

### Generate QA Pairs

In [None]:
model_name = "evaluation_gpt4o"  # the model used for QA-pair generation
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

# Per-run folder holding one parquet file per document. It is created here because its name
# depends on `timestamp`, which is assigned in this cell; to_parquet does not create
# missing directories.
run_folder = os.path.join(save_folder, timestamp)
os.makedirs(run_folder, exist_ok=True)

# The API key does not change between documents, so fetch it once instead of once per iteration.
api_key = dbutils.secrets.get(scope='keyvault-link', key='azure-openai-api-key')

qa_frames = []
for path in tqdm(document_paths):
    generator = QAPairGenerator(path, model_name, api_key)
    # 2 = number of QA pairs requested (per chapter, according to the notebook introduction)
    qa_pairs = generator.generate_qa_pairs(2)
    df = pd.DataFrame(qa_pairs)
    # Persist each document's result immediately so progress survives a failure mid-run.
    doc_name = os.path.basename(path).replace('.md', '')
    df.to_parquet(os.path.join(run_folder, f"question_answer_pairs_{doc_name}.parquet"), index=False)
    qa_frames.append(df)

# Saving
if qa_frames:
    combined_df = pd.concat(qa_frames)
    combined_df.to_parquet(f"{save_folder}question_answer_pairs_{timestamp}.parquet", index=False)
else:
    # pd.concat raises ValueError on an empty list; report the situation instead of crashing.
    combined_df = pd.DataFrame()
    print("No documents were processed; nothing to save.")
display(combined_df)

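#### If the code above fails, you can run the code below to keep the progress and resume from where you left off. The code is disabled by the surrounding `'''` quotes; remove them to run it.
#### Make sure to set the variable `timestamp` to the name of the folder where the progress is saved. If you resume on a different day, also set `save_folder` to the dated folder of the original run, because `save_folder` is derived from the current date.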

In [None]:
# timestamp = "20240928203654"

In [None]:
# Disabled resume logic: the whole cell is wrapped in a triple-quoted string, so nothing
# below executes until the surrounding quotes are removed.
#
# When enabled, it:
#   1. reloads every per-document parquet file found in <save_folder>/<timestamp>,
#   2. derives the already-processed document names from those file names,
#   3. generates QA pairs only for the documents in `document_paths` that are not yet processed,
#   4. writes the combined result to <save_folder>question_answer_pairs_<timestamp>.parquet.
#
# It relies on `save_folder`, `document_paths`, `model_name` and `timestamp` already being
# defined in the notebook session.
'''
combined_df = []

# Read in all previously saved dataframes
saved_files = [os.path.join(save_folder, timestamp, f) for f in os.listdir(os.path.join(save_folder, timestamp)) if f.endswith('.parquet')]
for file in saved_files:
    df = pd.read_parquet(file)
    combined_df.append(df)
    
# Get the list of already processed documents
processed_docs = [os.path.basename(f).replace('question_answer_pairs_', '').replace('.parquet', '') for f in saved_files]

# Filter out the already processed documents
remaining_paths = [path for path in document_paths if os.path.basename(path).replace('.md', '') not in processed_docs]
print(len(remaining_paths))

for path in tqdm(remaining_paths):
    generator = QAPairGenerator(path, model_name, dbutils.secrets.get(scope='keyvault-link', key='azure-openai-api-key'))
    qa_pairs = generator.generate_qa_pairs(2)
    df = pd.DataFrame(qa_pairs)
    df.to_parquet(os.path.join(save_folder, timestamp, f"question_answer_pairs_{os.path.basename(path).replace('.md', '')}.parquet"), index=False)
    combined_df.append(df)
combined_df = pd.concat(combined_df)
combined_df.to_parquet(f"{save_folder}question_answer_pairs_{timestamp}.parquet", index=False)
combined_df
'''