# Validate LLM responses

In [None]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import logging
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

import numpy as np
from sklearn.metrics import mean_squared_error
from scipy.spatial.distance import jensenshannon

In [None]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

Compares the CSV files in the ground-truth and predictions folders to find missing files, mismatched numbers of bot utterances, or invalid decision values.

In [None]:
def check_folders(predictions_folder, ground_truth_folder):
    """Compare two folders of CSV files and log any inconsistencies.

    Files are paired by base name. For every file present in both folders
    the function checks that the number of prediction rows equals the number
    of ground-truth rows whose 'Utterance' starts with "<number>. Bot:", and
    that every value in the predictions' 'decision' column is (case
    insensitively) exactly 'breakdown' or 'non-breakdown'. Problems are
    reported through ``logging.warning``; nothing is raised for them.

    Args:
        predictions_folder: Directory containing prediction CSVs; each file
            must have a 'decision' column.
        ground_truth_folder: Directory containing ground-truth CSVs; each
            file must have an 'Utterance' column.

    Returns:
        A tuple ``(n_pred_files, n_gt_files, missing_in_pred, missing_in_gt)``
        where the last two items are sets of file names present in only one
        of the folders.
    """
    valid_decisions = {'breakdown', 'non-breakdown'}

    pred_files = {os.path.basename(f) for f in glob.glob(os.path.join(predictions_folder, '*.csv'))}
    gt_files = {os.path.basename(f) for f in glob.glob(os.path.join(ground_truth_folder, '*.csv'))}

    logging.info("Number of files in predictions folder: %s", len(pred_files))
    logging.info("Number of files in ground truth folder: %s", len(gt_files))

    missing_in_pred = gt_files - pred_files
    missing_in_gt = pred_files - gt_files

    if missing_in_pred:
        logging.warning("Files missing in predictions: %s", missing_in_pred)
    if missing_in_gt:
        logging.warning("Files missing in ground truth: %s", missing_in_gt)

    # Sorted so that warnings come out in the same order on every run.
    for file in sorted(pred_files & gt_files):
        predictions_df = pd.read_csv(os.path.join(predictions_folder, file))
        ground_truth_df = pd.read_csv(os.path.join(ground_truth_folder, file))

        # na=False: an empty 'Utterance' cell counts as "not a bot turn"
        # instead of producing a NaN mask entry.
        is_bot_turn = ground_truth_df['Utterance'].str.contains("^\\d+\\. Bot:", regex=True, na=False)
        n_bot_turns = int(is_bot_turn.sum())

        if len(predictions_df) != n_bot_turns:
            logging.warning(
                "Mismatch in the number of bot responses in file: %s. "
                "Predictions have %s, Ground Truth has %s",
                file, len(predictions_df), n_bot_turns,
            )

        # Checked element-wise so a missing or non-string decision is flagged
        # as invalid rather than breaking the .str accessor.
        decision_ok = predictions_df['decision'].map(
            lambda d: isinstance(d, str) and d.lower() in valid_decisions
        )
        if not decision_ok.all():
            logging.warning("Invalid decision in file: %s.", file)

    return len(pred_files), len(gt_files), missing_in_pred, missing_in_gt


In [None]:
def decision_to_binary(decision):
    """
    Map a textual decision onto a binary label.

    The match is case-insensitive and substring-based: any text containing
    'non-breakdown' yields 0, every other text yields 1.
    """
    is_breakdown = 'non-breakdown' not in decision.lower()
    return int(is_breakdown)

Evaluates a single predictions file against its ground-truth file; useful for verifying that the metric functions work as expected.

In [None]:
def evaluate_predictions(predictions_file, ground_truth_file):
    """Score one predictions CSV against its ground-truth CSV.

    Ground-truth rows are restricted to those whose 'Utterance' starts with
    "<number>. Bot:". Both tables are aligned on the leading number of the
    ground truth's 'Utterance' and the predictions' 'segment', using a left
    merge from the predictions. Prediction rows without a usable
    ground-truth label are dropped before scoring.

    Args:
        predictions_file: Path to a CSV with 'segment' and 'decision' columns.
        ground_truth_file: Path to a CSV with 'Utterance' and
            'Majority Voting' columns.

    Returns:
        A tuple ``(accuracy, recall, precision, f1, js_divergence, mse)``
        with 'breakdown' treated as the positive class (1).
    """
    label_to_int = {'non-breakdown': 0, 'breakdown': 1}

    def line_number(text):
        # "12. Bot: ..." -> 12
        return int(text.split('.')[0])

    predictions_df = pd.read_csv(predictions_file)
    ground_truth_df = pd.read_csv(ground_truth_file)

    # Keep only bot utterances. na=False treats empty cells as non-matching
    # instead of yielding a NaN mask, and .copy() gives an independent frame
    # so the column assignment below does not write into a slice.
    is_bot_turn = ground_truth_df['Utterance'].str.contains("^\\d+\\. Bot:", regex=True, na=False)
    ground_truth_df = ground_truth_df[is_bot_turn].copy()

    # Extract line numbers to align both DataFrames; astype(int) keeps the
    # key integer-typed even when a frame is empty.
    ground_truth_df['Line Number'] = ground_truth_df['Utterance'].apply(line_number).astype(int)
    predictions_df['Line Number'] = predictions_df['segment'].apply(line_number).astype(int)

    merged_df = pd.merge(predictions_df, ground_truth_df, on='Line Number', how='left')

    merged_df['Prediction'] = merged_df['decision'].apply(decision_to_binary)

    # Normalise case/whitespace before mapping so e.g. "Breakdown " is still
    # recognised; non-string values (NaN from the left merge) pass through.
    normalised_votes = merged_df['Majority Voting'].map(
        lambda v: v.strip().lower() if isinstance(v, str) else v
    )
    merged_df['Truth'] = normalised_votes.map(label_to_int)

    unrecognised = merged_df['Truth'].isna() & merged_df['Majority Voting'].notna()
    if unrecognised.any():
        logging.warning(
            "Dropping %d row(s) with unrecognised 'Majority Voting' labels in %s",
            int(unrecognised.sum()), ground_truth_file,
        )

    # Drop on the mapped column: this removes rows with no ground-truth label
    # (e.g. the first bot response in some files) as well as rows whose label
    # could not be mapped, so the integer cast below cannot hit NaN.
    merged_df = merged_df.dropna(subset=['Truth'])
    truth = merged_df['Truth'].astype(int)
    prediction = merged_df['Prediction']

    accuracy = accuracy_score(truth, prediction)
    # zero_division=0 returns the same 0.0 as the default for files with no
    # positive labels/predictions, without emitting UndefinedMetricWarning.
    recall = recall_score(truth, prediction, zero_division=0)
    precision = precision_score(truth, prediction, zero_division=0)
    f1 = f1_score(truth, prediction, zero_division=0)
    mse = mean_squared_error(truth, prediction)

    # For JS divergence, compare the class frequency distributions [P(0), P(1)]
    # of predictions and ground truth. scipy's jensenshannon returns the
    # distance (square root of the divergence), hence the squaring.
    predictions_prob = prediction.value_counts(normalize=True).reindex([0, 1]).fillna(0)
    ground_truth_prob = truth.value_counts(normalize=True).reindex([0, 1]).fillna(0)
    js_divergence = jensenshannon(predictions_prob, ground_truth_prob) ** 2

    return accuracy, recall, precision, f1, js_divergence, mse

In [None]:
def _aligned_binary_labels(predictions_file, ground_truth_file):
    """Return ``(prediction, truth)`` integer Series for one file pair.

    Ground-truth bot utterances are left-merged onto the predictions by
    their leading line number; rows without a usable 'Majority Voting'
    label are dropped.
    """
    def line_number(text):
        # "12. Bot: ..." -> 12
        return int(text.split('.')[0])

    predictions_df = pd.read_csv(predictions_file)
    ground_truth_df = pd.read_csv(ground_truth_file)

    # na=False: empty 'Utterance' cells are simply not bot turns.
    is_bot_turn = ground_truth_df['Utterance'].str.contains("^\\d+\\. Bot:", regex=True, na=False)
    ground_truth_df = ground_truth_df[is_bot_turn].copy()

    ground_truth_df['Line Number'] = ground_truth_df['Utterance'].apply(line_number).astype(int)
    predictions_df['Line Number'] = predictions_df['segment'].apply(line_number).astype(int)

    merged_df = pd.merge(predictions_df, ground_truth_df, on='Line Number', how='left')
    merged_df['Prediction'] = merged_df['decision'].apply(decision_to_binary)
    merged_df['Truth'] = merged_df['Majority Voting'].map(
        lambda v: v.strip().lower() if isinstance(v, str) else v
    ).map({'non-breakdown': 0, 'breakdown': 1})

    # Dropping on the mapped column guarantees the integer cast is NaN-free.
    merged_df = merged_df.dropna(subset=['Truth'])
    return merged_df['Prediction'], merged_df['Truth'].astype(int)


def evaluate_folder(predictions_folder, ground_truth_folder):
    """Evaluate every CSV present in both folders and print summary metrics.

    Each common file is scored with ``evaluate_predictions``; the per-file
    metrics are averaged with equal weight per file (macro average). In
    addition a "Combined MSE" is computed over all rows of all files pooled
    together, so long files weigh more than short ones.

    Args:
        predictions_folder: Directory containing prediction CSVs.
        ground_truth_folder: Directory containing ground-truth CSVs with the
            same file names.

    Returns:
        None. Results are printed; if the folders share no CSV file a warning
        is logged and nothing is printed.
    """
    pred_names = {os.path.basename(p) for p in glob.glob(os.path.join(predictions_folder, '*.csv'))}
    gt_names = {os.path.basename(p) for p in glob.glob(os.path.join(ground_truth_folder, '*.csv'))}

    # Sorted for a reproducible processing (and floating-point summation) order.
    common_files = sorted(pred_names & gt_names)

    if not common_files:
        # Typically a mistyped folder path; bail out instead of dividing by zero.
        logging.warning(
            "No common CSV files between '%s' and '%s'; nothing to evaluate.",
            predictions_folder, ground_truth_folder,
        )
        return None

    # Keys double as the printed metric names; their order matches the tuple
    # returned by evaluate_predictions.
    per_file_scores = {
        'Accuracy': [],
        'Recall': [],
        'Precision': [],
        'F1 Score': [],
        'JS Divergence': [],
        'MSE': [],
    }
    total_squared_error = 0.0
    total_count = 0

    for file in common_files:
        predictions_file_path = os.path.join(predictions_folder, file)
        ground_truth_file_path = os.path.join(ground_truth_folder, file)

        file_metrics = evaluate_predictions(predictions_file_path, ground_truth_file_path)
        for scores, value in zip(per_file_scores.values(), file_metrics):
            scores.append(value)

        # Pool the raw squared errors across files for the combined MSE.
        prediction, truth = _aligned_binary_labels(predictions_file_path, ground_truth_file_path)
        total_squared_error += ((prediction - truth) ** 2).sum()
        total_count += len(truth)

    for metric_name, scores in per_file_scores.items():
        print(f"Average {metric_name}: {sum(scores) / len(scores):.4f}")

    combined_mse = total_squared_error / total_count if total_count > 0 else float('nan')
    print(f"Combined MSE: {combined_mse:.4f}")

In [None]:
# Check results/two_shot/gpt3 for missing files, row-count mismatches and invalid decisions.
predictions_folder = 'results/two_shot/gpt3'
ground_truth_folder = 'db/eval_labelled'
check_folders(predictions_folder, ground_truth_folder)

In [None]:
# Same consistency check for results/two_shot/gpt4.
predictions_folder = 'results/two_shot/gpt4'
ground_truth_folder = 'db/eval_labelled'
check_folders(predictions_folder, ground_truth_folder)

In [None]:
# Same consistency check for results/CoT/gpt4.
predictions_folder = 'results/CoT/gpt4'
ground_truth_folder = 'db/eval_labelled'
check_folders(predictions_folder, ground_truth_folder)

In [None]:
# Same consistency check for results/zero_shot/gpt4.
predictions_folder = 'results/zero_shot/gpt4'
ground_truth_folder = 'db/eval_labelled'
check_folders(predictions_folder, ground_truth_folder)

In [None]:
# Same consistency check for results/two_shot_CoT/gpt3.
predictions_folder = 'results/two_shot_CoT/gpt3'
ground_truth_folder = 'db/eval_labelled'
check_folders(predictions_folder, ground_truth_folder)

In [None]:
# Same consistency check for results/two_shot_CoT/gpt4.
predictions_folder = 'results/two_shot_CoT/gpt4'
ground_truth_folder = 'db/eval_labelled'
check_folders(predictions_folder, ground_truth_folder)

# Zero Shot Results

In [None]:
# Print averaged metrics for results/zero_shot_v1/gpt3 against db/eval_labelled.
predictions_folder = 'results/zero_shot_v1/gpt3'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# Print averaged metrics for results/zero_shot/gpt3.
predictions_folder = 'results/zero_shot/gpt3'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# Print averaged metrics for results/zero_shot_v1/gpt4.
predictions_folder = 'results/zero_shot_v1/gpt4'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# Print averaged metrics for results/zero_shot/gpt4.
predictions_folder = 'results/zero_shot/gpt4'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

# CoT

Step-by-step prompt:

    Your analysis should include:
    - A step-by-step reasoning process explaining how you reached your conclusion.
    - A final decision categorized as either 'BREAKDOWN' or 'NON-BREAKDOWN'.
    - A score from 0 to 1, where 0 represents a total breakdown and 1 indicates a completely smooth conversation.


In [None]:
# Print averaged metrics for results/CoT_v1/gpt3 against db/eval_labelled.
predictions_folder = 'results/CoT_v1/gpt3'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# Print averaged metrics for results/CoT_v1/gpt4.
predictions_folder = 'results/CoT_v1/gpt4'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

### Long Guided Step-By-Step Prompt to rationalize the thinking 

In [None]:
# Print averaged metrics for results/CoT_long/gpt3 against db/eval_labelled.
predictions_folder = 'results/CoT_long/gpt3'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# Print averaged metrics for results/CoT_long/gpt4.
predictions_folder = 'results/CoT_long/gpt4'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

Final CoT Prompt:

            - Before answering, you should think through the question step-by-step.
            - Explain your reasoning at each step towards making your final decision.

In [None]:
# Print averaged metrics for results/CoT/gpt3 against db/eval_labelled.
predictions_folder = 'results/CoT/gpt3'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# Print averaged metrics for results/CoT/gpt4.
predictions_folder = 'results/CoT/gpt4'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# Print averaged metrics for results/CoT_long/gpt4.
# NOTE(review): this folder is also evaluated in an earlier cell.
predictions_folder = 'results/CoT_long/gpt4'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# Print averaged metrics for results/CoT_v1/gpt4.
# NOTE(review): this folder is also evaluated in an earlier cell.
predictions_folder = 'results/CoT_v1/gpt4'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# Print averaged metrics for results/CoT_v2/gpt4.
predictions_folder = 'results/CoT_v2/gpt4'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# Print averaged metrics for results/CoT/gpt4.
# NOTE(review): identical to the results/CoT/gpt4 cell a few cells above.
predictions_folder = 'results/CoT/gpt4'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

# One Shot - BREAKDOWN

In [None]:
# Print averaged metrics for results/one_shot_b/gpt3 against db/eval_labelled.
predictions_folder = 'results/one_shot_b/gpt3'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# Print averaged metrics for results/one_shot/gpt3.
predictions_folder = 'results/one_shot/gpt3'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# Print averaged metrics for results/one_shot/gpt4.
predictions_folder = 'results/one_shot/gpt4'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# Print averaged metrics for results/two_shot/gpt4.
predictions_folder = 'results/two_shot/gpt4'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# Print averaged metrics for results/two_shot_v1/gpt3.
predictions_folder = 'results/two_shot_v1/gpt3'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# Print averaged metrics for results/two_shot_v2/gpt3.
predictions_folder = 'results/two_shot_v2/gpt3'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# Print averaged metrics for results/two_shot_v2/gpt4.
predictions_folder = 'results/two_shot_v2/gpt4'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# Print averaged metrics for results/two_shot_v3/gpt3.
predictions_folder = 'results/two_shot_v3/gpt3'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

# Two Shot - v5

In [None]:
# Print averaged metrics for results/two_shot/gpt3 against db/eval_labelled.
predictions_folder = 'results/two_shot/gpt3'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# Print averaged metrics for results/two_shot/gpt4.
predictions_folder = 'results/two_shot/gpt4'
ground_truth_folder = 'db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)

# Two-Shot CoT

In [None]:
# From here on most cells first run the consistency check and then print the
# averaged metrics for the same folder pair.
# results/two_shot_CoT/gpt3
predictions_folder = 'results/two_shot_CoT/gpt3'
ground_truth_folder = 'db/eval_labelled'
check_folders(predictions_folder, ground_truth_folder)
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# results/two_shot_CoT/gpt4
predictions_folder = 'results/two_shot_CoT/gpt4'
ground_truth_folder = 'db/eval_labelled'
check_folders(predictions_folder, ground_truth_folder)
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# results/four_shot/gpt3
predictions_folder = 'results/four_shot/gpt3'
ground_truth_folder = 'db/eval_labelled'
check_folders(predictions_folder, ground_truth_folder)
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# results/four_shot/gpt4
predictions_folder = 'results/four_shot/gpt4'
ground_truth_folder = 'db/eval_labelled'
check_folders(predictions_folder, ground_truth_folder)
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# results/four_shot_CoT/gpt3
predictions_folder = 'results/four_shot_CoT/gpt3'
ground_truth_folder = 'db/eval_labelled'
check_folders(predictions_folder, ground_truth_folder)
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# results/four_shot_CoT/gpt4
predictions_folder = 'results/four_shot_CoT/gpt4'
ground_truth_folder = 'db/eval_labelled'
check_folders(predictions_folder, ground_truth_folder)
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# results/zero_shot/medium
predictions_folder = 'results/zero_shot/medium'
ground_truth_folder = 'db/eval_labelled'
check_folders(predictions_folder, ground_truth_folder)
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# results/CoT/medium
predictions_folder = 'results/CoT/medium'
ground_truth_folder = 'db/eval_labelled'
check_folders(predictions_folder, ground_truth_folder)
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# results/four_shot/medium
predictions_folder = 'results/four_shot/medium'
ground_truth_folder = 'db/eval_labelled'
check_folders(predictions_folder, ground_truth_folder)
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# results/two_shot/medium
predictions_folder = 'results/two_shot/medium'
ground_truth_folder = 'db/eval_labelled'
check_folders(predictions_folder, ground_truth_folder)
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# results/two_shot_CoT/medium
predictions_folder = 'results/two_shot_CoT/medium'
ground_truth_folder = 'db/eval_labelled'
check_folders(predictions_folder, ground_truth_folder)
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# results/four_shot_CoT/medium
predictions_folder = 'results/four_shot_CoT/medium'
ground_truth_folder = 'db/eval_labelled'
check_folders(predictions_folder, ground_truth_folder)
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# results/CoT/medium
# NOTE(review): identical to the results/CoT/medium cell a few cells above.
predictions_folder = 'results/CoT/medium'
ground_truth_folder = 'db/eval_labelled'
check_folders(predictions_folder, ground_truth_folder)
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# results/anal_reason/gpt4
predictions_folder = 'results/anal_reason/gpt4'
ground_truth_folder = 'db/eval_labelled'
check_folders(predictions_folder, ground_truth_folder)
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# results/CoT/gpt4
# NOTE(review): this folder is also checked and evaluated in earlier cells.
predictions_folder = 'results/CoT/gpt4'
ground_truth_folder = 'db/eval_labelled'
check_folders(predictions_folder, ground_truth_folder)
evaluate_folder(predictions_folder, ground_truth_folder)

In [None]:
# Metrics only (no consistency check) for ../Results/CoT/gpt3.
# NOTE(review): unlike every other cell, both paths are relative to the parent
# directory and "Results" is capitalised — confirm this is intentional.
predictions_folder = '../Results/CoT/gpt3' 
ground_truth_folder = '../db/eval_labelled'
evaluate_folder(predictions_folder, ground_truth_folder)