In [1]:
import pandas as pd
import os
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import linregress
import json
from plotly.subplots import make_subplots
from scipy.stats import f_oneway

### Read in the files and save them as dataframes in a dictionary

In [2]:
# reads in the question_answer_pairs parquet and retrieves the highest timestamp
#base_folder = "/Volumes/uc-catalog-dev/advancedanalytics-productai-dev/transformed_dev/llm-evaluation/" + datetime.now().strftime("%Y-%m-%d") + "/"
#df = pd.read_parquet(base_folder + "question_answer_pairs+productai_answers+evaluation_results_20240928203654_subset_run1.parquet")

def read_parquets_to_dict(folder_path):
    """Load the anonymized result parquet files of a folder into a dictionary.

    Only directory entries whose name starts with ``complete_anonymized`` and
    ends with ``.parquet`` are read; everything else in the folder is ignored.

    Parameters
    ----------
    folder_path : str
        Directory that is scanned (non-recursively) for parquet files.

    Returns
    -------
    dict
        Maps each file name without its extension to the pandas DataFrame
        read from that file. Empty when no file matches.
    """
    # Names of the entries that look like anonymized result files.
    matching_names = (
        entry
        for entry in os.listdir(folder_path)
        if entry.startswith("complete_anonymized") and entry.endswith('.parquet')
    )

    # Key: file name stripped of its extension; value: the file's contents.
    return {
        os.path.splitext(entry)[0]: pd.read_parquet(os.path.join(folder_path, entry))
        for entry in matching_names
    }

# Folder that contains the run parquet files; replace it with the path to your own folder.
base_folder = "/home/jovyan/LLM_evaluation/runs/"

# One DataFrame per matching parquet file found in that folder.
dfs_dict = read_parquets_to_dict(base_folder)

# Optional sanity check: list the dictionary keys to see which files were loaded.
print("DataFrames loaded from parquet files:", list(dfs_dict))

DataFrames loaded from parquet files: ['complete_anonymized_results_run2_subset240928203654', 'complete_anonymized_results_run3_subset240928203654', 'complete_anonymized_results_run1_subset240928203654', 'complete_anonymized_results_240926', 'complete_anonymized_results_run5_3subset240928203654', 'complete_anonymized_results_240928203654', 'complete_anonymized_results_run5_subset240928203654', 'complete_anonymized_results_run5_2subset240928203654', 'complete_anonymized_results_240924', 'complete_anonymized_results_240928142914', 'complete_anonymized_results_run4_subset240928203654']


### Comparison of Product AI responses to the same questions
ANOVA = Analysis of Variance \
The F-statistic is the ratio of the variance between the groups to the variance within the groups. \
If the F-statistic is large, it suggests a significant difference between the group means. \
A high p-value means that the differences between the group means are not statistically significant. \
One-Way ANOVA: Compares the means of three or more groups based on one independent variable. For example, comparing test scores across different teaching methods.

In [3]:
# Keys of the five runs that are compared with each other.
productai_runs_keys = ['complete_anonymized_results_run1_subset240928203654', 'complete_anonymized_results_run2_subset240928203654', 
                       'complete_anonymized_results_run3_subset240928203654', 'complete_anonymized_results_run4_subset240928203654', 
                       'complete_anonymized_results_run5_subset240928203654']

# Report requested runs that were not loaded instead of dropping them silently;
# otherwise the ANOVA below would quietly compare fewer groups than intended.
missing_keys = [key for key in productai_runs_keys if key not in dfs_dict]
if missing_keys:
    print("Warning: runs not found in dfs_dict and skipped:", missing_keys)

# Create a subset dictionary holding a sorted copy of each run.
# sort_values() without inplace returns a new frame, so the frames stored in dfs_dict
# are not mutated and re-running this cell always gives the same result.
# The ANOVA itself does not depend on row order; the sort is kept so that code which
# later aligns these runs by position receives frames ordered by question.
productai_runs_dict = {
    key: dfs_dict[key].sort_values(by='question')
    for key in productai_runs_keys
    if key in dfs_dict
}

# Extract evaluation scores from each DataFrame (one array per run)
scores_list = [run_df['evaluation_score'].values for run_df in productai_runs_dict.values()]

# Perform one-way ANOVA across the runs.
# NOTE(review): f_oneway assumes independent samples, while these runs score the same
# questions; a repeated-measures test (e.g. scipy.stats.friedmanchisquare) may be a
# better fit - confirm.
anova_result = f_oneway(*scores_list)

# Output the results
print("ANOVA F-statistic:", anova_result.statistic)
print("ANOVA p-value:", anova_result.pvalue)

ANOVA F-statistic: 0.016651206409720593
ANOVA p-value: 0.9994555390362249


### Count the questions whose evaluation score differs between at least two of the five runs.

In [4]:
# Align the runs row by row on the 'question' column.
# Every run is sorted here, inside this cell, so the result does not depend on an
# earlier cell having already sorted the frames in place.
sorted_runs = {
    name: run_df.sort_values(by='question')
    for name, run_df in productai_runs_dict.items()
}

# Start with the 'question' column of the first run; it labels each row of the comparison.
reference_run = 'complete_anonymized_results_run1_subset240928203654'
merged_df = sorted_runs[reference_run][['question']].copy()
reference_questions = merged_df['question'].tolist()

# Add evaluation scores from each run as one column per run.
# Scores are attached by position, which is only meaningful when every run contains
# exactly the same questions, so that is verified before each assignment.
for name, run_df in sorted_runs.items():
    if run_df['question'].tolist() != reference_questions:
        raise ValueError(
            f"Run '{name}' does not contain the same questions as '{reference_run}'; "
            "evaluation scores cannot be aligned by position."
        )
    merged_df[name] = run_df['evaluation_score'].values

# A question has different scores when its row holds more than one distinct value
# across the run columns.
# NOTE: nunique() skips NaN by default, so a score that is missing in some runs is not
# counted as a difference on its own.
score_columns = list(sorted_runs)
merged_df['different_scores'] = merged_df[score_columns].nunique(axis=1) > 1

# Count how many questions have different evaluation scores
different_questions_count = merged_df['different_scores'].sum()

print("Number of questions with different evaluation scores:", different_questions_count)

Number of questions with different evaluation scores: 40


### Comparison of Evaluation results to the same Product AI responses

In [5]:
# Keys of the three evaluation runs that are compared with each other.
evaluation_runs_keys = ['complete_anonymized_results_run5_subset240928203654', 'complete_anonymized_results_run5_2subset240928203654',
                        'complete_anonymized_results_run5_3subset240928203654']

# Report requested runs that were not loaded instead of dropping them silently;
# otherwise the ANOVA below would quietly compare fewer groups than intended.
missing_keys = [key for key in evaluation_runs_keys if key not in dfs_dict]
if missing_keys:
    print("Warning: runs not found in dfs_dict and skipped:", missing_keys)

# Create a subset dictionary holding a sorted copy of each run.
# sort_values() without inplace returns a new frame, so the frames stored in dfs_dict
# are not mutated and re-running this cell always gives the same result.
# The ANOVA itself does not depend on row order; the sort is kept so that code which
# later aligns these runs by position receives frames ordered by question.
evaluation_runs_dict = {
    key: dfs_dict[key].sort_values(by='question')
    for key in evaluation_runs_keys
    if key in dfs_dict
}

# Extract evaluation scores from each DataFrame (one array per run)
scores_list = [run_df['evaluation_score'].values for run_df in evaluation_runs_dict.values()]

# Perform one-way ANOVA across the runs.
# NOTE(review): f_oneway assumes independent samples, while these runs score the same
# responses; a repeated-measures test (e.g. scipy.stats.friedmanchisquare) may be a
# better fit - confirm.
anova_result = f_oneway(*scores_list)

# Output the results
print("ANOVA F-statistic:", anova_result.statistic)
print("ANOVA p-value:", anova_result.pvalue)

ANOVA F-statistic: 0.13172600989940925
ANOVA p-value: 0.8766323207889749


### Count the questions whose evaluation score differs between at least two of the three evaluation runs.

In [6]:
# Align the runs row by row on the 'question' column.
# Every run is sorted here, inside this cell, so the result does not depend on an
# earlier cell having already sorted the frames in place.
sorted_runs = {
    name: run_df.sort_values(by='question')
    for name, run_df in evaluation_runs_dict.items()
}

# Start with the 'question' column of the first run; it labels each row of the comparison.
reference_run = 'complete_anonymized_results_run5_subset240928203654'
merged_df = sorted_runs[reference_run][['question']].copy()
reference_questions = merged_df['question'].tolist()

# Add evaluation scores from each run as one column per run.
# Scores are attached by position, which is only meaningful when every run contains
# exactly the same questions, so that is verified before each assignment.
for name, run_df in sorted_runs.items():
    if run_df['question'].tolist() != reference_questions:
        raise ValueError(
            f"Run '{name}' does not contain the same questions as '{reference_run}'; "
            "evaluation scores cannot be aligned by position."
        )
    merged_df[name] = run_df['evaluation_score'].values

# A question has different scores when its row holds more than one distinct value
# across the run columns.
# NOTE: nunique() skips NaN by default, so a score that is missing in some runs is not
# counted as a difference on its own.
score_columns = list(sorted_runs)
merged_df['different_scores'] = merged_df[score_columns].nunique(axis=1) > 1

# Count how many questions have different evaluation scores
different_questions_count = merged_df['different_scores'].sum()

print("Number of questions with different evaluation scores:", different_questions_count)

Number of questions with different evaluation scores: 25
