# Evaluation of ChatGPT Baseline on CLAQUA corpus

In [42]:
import pandas as pd
import numpy as np

In [43]:
cd /content/drive/MyDrive/thesis/chat-gpt_baseline

/content/drive/MyDrive/thesis/chat-gpt_baseline


## Calculate metrics
Metrics used for evaluation:
- Accuracy
- Precision
- Recall
- F1-score

In [44]:
def get_true_positives(df, prompt_label):
  """Count the rows whose gold label and predicted label are both 1.

  Args:
    df: DataFrame with a "label" column and a `prompt_label` column.
    prompt_label: Name of the column holding the predicted labels.

  Returns:
    Number of true positives as an int.
  """
  gold_positive = df["label"] == 1
  predicted_positive = df[prompt_label] == 1
  return int((gold_positive & predicted_positive).sum())

def get_true_negatives(df, prompt_label):
  """Count the rows whose gold label and predicted label are both 0.

  Args:
    df: DataFrame with a "label" column and a `prompt_label` column.
    prompt_label: Name of the column holding the predicted labels.

  Returns:
    Number of true negatives as an int.
  """
  gold_negative = df["label"] == 0
  predicted_negative = df[prompt_label] == 0
  return int((gold_negative & predicted_negative).sum())

def get_false_positives(df, prompt_label):
  """Count the rows with gold label 0 that were predicted as 1.

  Args:
    df: DataFrame with a "label" column and a `prompt_label` column.
    prompt_label: Name of the column holding the predicted labels.

  Returns:
    Number of false positives as an int.
  """
  gold_negative = df["label"] == 0
  predicted_positive = df[prompt_label] == 1
  return int((gold_negative & predicted_positive).sum())

def get_false_negatives(df, prompt_label):
  """Count the rows with gold label 1 that were predicted as 0.

  Args:
    df: DataFrame with a "label" column and a `prompt_label` column.
    prompt_label: Name of the column holding the predicted labels.

  Returns:
    Number of false negatives as an int.
  """
  gold_positive = df["label"] == 1
  predicted_negative = df[prompt_label] == 0
  return int((gold_positive & predicted_negative).sum())

def calculate_accuracy(true_positives, false_positives, true_negatives, false_negatives):
  """Return the share of correct predictions among all counted predictions.

  Args:
    true_positives: Number of true positives.
    false_positives: Number of false positives.
    true_negatives: Number of true negatives.
    false_negatives: Number of false negatives.

  Returns:
    (TP + TN) / (TP + TN + FP + FN), or 0.0 when all four counts are zero
    (instead of raising ZeroDivisionError).
  """
  total = true_positives + true_negatives + false_positives + false_negatives
  if total == 0:
    return 0.0
  return (true_positives + true_negatives) / total

def calculate_precision(true_positives, false_positives):
  """Return the precision TP / (TP + FP).

  Args:
    true_positives: Number of true positives.
    false_positives: Number of false positives.

  Returns:
    The precision, or 0.0 when nothing was predicted positive (TP + FP == 0),
    instead of raising ZeroDivisionError.
  """
  predicted_positives = true_positives + false_positives
  if predicted_positives == 0:
    return 0.0
  return true_positives / predicted_positives

def calculate_recall(true_positives, false_negatives):
  """Return the recall TP / (TP + FN).

  Args:
    true_positives: Number of true positives.
    false_negatives: Number of false negatives.

  Returns:
    The recall, or 0.0 when there are no positive gold items (TP + FN == 0),
    instead of raising ZeroDivisionError.
  """
  actual_positives = true_positives + false_negatives
  if actual_positives == 0:
    return 0.0
  return true_positives / actual_positives

def calculate_f1_score(precision, recall):
  """Return the harmonic mean of precision and recall.

  Args:
    precision: Precision value.
    recall: Recall value.

  Returns:
    2 * precision * recall / (precision + recall), or 0.0 when
    precision + recall is zero (instead of raising ZeroDivisionError).
  """
  denominator = precision + recall
  if denominator == 0:
    return 0.0
  return 2 * (precision * recall) / denominator

In [45]:
def evaluate(df, prompt_eval):
  """Print and return accuracy, precision, recall and F1 for one prediction column.

  Args:
    df: DataFrame with the gold "label" column and the `prompt_eval` column.
    prompt_eval: Name of the column holding the predicted labels.

  Returns:
    Tuple (accuracy, precision, recall, f1_score).
  """
  # confusion-matrix counts
  tp = get_true_positives(df, prompt_eval)
  fp = get_false_positives(df, prompt_eval)
  tn = get_true_negatives(df, prompt_eval)
  fn = get_false_negatives(df, prompt_eval)

  # all metrics are computed before anything is printed
  accuracy = calculate_accuracy(tp, fp, tn, fn)
  precision = calculate_precision(tp, fp)
  recall = calculate_recall(tp, fn)
  f1_score = calculate_f1_score(precision, recall)

  report = (
      ("Accuracy", accuracy),
      ("Precision", precision),
      ("Recall", recall),
      ("F1 score", f1_score),
  )
  for metric_name, metric_value in report:
    print(f"\t {metric_name}: {metric_value}")

  return accuracy, precision, recall, f1_score

In [46]:
def evaluate_explanations(df, prompt_eval_explanation_based):
  """Return the share of explanations judged "correct" among all judged explanations.

  Only rows whose value is exactly "correct" or "incorrect" are counted; rows
  with any other value (e.g. "-") are ignored.

  Args:
    df: DataFrame holding the explanation judgement column.
    prompt_eval_explanation_based: Name of the judgement column.

  Returns:
    correct / (correct + incorrect) as a float, or 0.0 when no row carries
    either judgement (instead of raising ZeroDivisionError).
  """
  judgements = df[prompt_eval_explanation_based]
  num_correct = int((judgements == "correct").sum())
  num_incorrect = int((judgements == "incorrect").sum())
  num_judged = num_correct + num_incorrect
  if num_judged == 0:
    return 0.0
  return num_correct / num_judged

In [47]:
def evaluate_hallucinations_omissions_incoherence(df, prompt_eval_type):
  """Return the share of rows flagged with 1 among the rows flagged 0 or 1.

  Used for the hallucination / omission / incoherence / focus-deviation
  columns. Rows whose value is neither 0 nor 1 are ignored.

  Args:
    df: DataFrame holding the 0/1 flag column.
    prompt_eval_type: Name of the flag column.

  Returns:
    ones / (zeros + ones) as a float, or 0.0 when the column contains no
    0 or 1 at all (instead of raising ZeroDivisionError).
  """
  flags = df[prompt_eval_type]
  num_flagged = int((flags == 1).sum())
  num_unflagged = int((flags == 0).sum())
  num_rated = num_flagged + num_unflagged
  if num_rated == 0:
    return 0.0
  return num_flagged / num_rated

## Single-turn Results Evaluation

In [48]:
# Relative path of the single-turn results CSV that is loaded and evaluated below.
single_turn_gpt_results_path = './single_turn_test_sample_evaluated.csv'

In [49]:
from ast import literal_eval

# Load the single-turn results; both attribute columns are parsed with literal_eval.
single_turn_gpt_results = pd.read_csv(
    single_turn_gpt_results_path,
    delimiter=';',
    header=0,
    converters={'attrs_entity1': literal_eval, 'attrs_entity2': literal_eval},
)
# Remove the listed 'Unnamed: ...' columns (presumably stale saved indexes -- verify against the CSV).
single_turn_gpt_results = single_turn_gpt_results.drop(
    columns=[
        'Unnamed: 0',
        'Unnamed: 0.1',
        'Unnamed: 0.2',
        'Unnamed: 0.3',
        'Unnamed: 0.4',
        'Unnamed: 0.5',
    ],
)

In [50]:
# Display the loaded single-turn results (the notebook output shows a truncated preview).
single_turn_gpt_results

Unnamed: 0,sample_index,corpus_index,label,context,entity1,entity2,prompt2_out,prompt2_label,prompt2_eval_label-based,prompt2_eval_explanation-based,...,prompt1_eval_hallucination,prompt1_eval_omission,prompt1_eval_incoherence,prompt2_eval_hallucination,prompt2_eval_omission,prompt2_eval_incoherence,prompt1_eval_focus-deviation,prompt2_eval_focus-deviation,attrs_entity1,attrs_entity2
0,0,661,0,specify the name of the release of Farewell?,Farewell Farewell is a live album by jazz com...,Farewell Farewell is an American pop punk ban...,The user question needs a clarification reques...,1,incorrect,-,...,0,1,1,1,1,0,1,0,"[commerce.product, media_common.cataloged_inst...","[event.agent, internet.social_network_user, me..."
1,1,427,1,First person to summit Mount Olympus,"Mount Olympus Mount Olympus (/oʊˈlɪmpəs, əˈlɪ...","Mount Olympus Mount Olympus, at 7,980 feet, i...",The user question needs a clarification reques...,1,correct,correct,...,0,1,0,0,0,0,1,0,"[location.location, fictional_universe.setting...","[geography.geographical_feature, geography.mou..."
2,2,192,0,state the sports statistics of Will Brooks,"Will Brooks Will Brooks is an actor, known fo...",Will Brooks mixed martial artist,The user question needs a clarification reques...,1,incorrect,-,...,0,0,0,0,0,0,1,0,"[biology.organism, event.agent, film.actor, me...","[martial_arts.martial_artist, media_common.cat..."
3,3,209,0,mention where Palookaville can be purchased on...,Palookaville Palookaville is a comic book wri...,Palookaville Palookaville is the fourth studi...,The user question needs a clarification reques...,1,incorrect,-,...,0,0,0,1,0,0,0,0,"[book.written_work, comic_books.series]","[award.nominated_work, commerce.consumer_produ..."
4,4,738,0,What was the set location of Blood of Man,Blood of Man Blood of Man is the eighth album...,"Blood of Man Charlie, a compulsive liar, and ...",The user question needs a clarification reques...,1,incorrect,-,...,1,1,0,0,1,0,1,0,"[media_common.cataloged_instance, music.album,...","[media_common.creative_work, film.film, media_..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,482,0,What is the name of the musical composition wh...,Band of the Hand Band of the Hand is an Ameri...,Band of the Hand Band of the Hand is the 1986...,The user question does not need a clarificatio...,0,correct,incorrect,...,0,0,0,0,1,1,1,0,"[media_common.creative_work, ratings.rated_ent...","[media_common.creative_work, music.album, musi..."
96,96,764,0,Which software was developed by Rusty Russell?,Rusty Russell William Rusty Russell (born Aug...,Rusty Russell Paul Rusty Russell is an Austra...,The user question needs a clarification reques...,1,incorrect,-,...,0,0,0,0,1,0,1,0,"[media_common.cataloged_instance, people.perso...","[computer.software_developer, people.person, b..."
97,97,328,1,What video corresponds to How the States Got T...,How the States Got Their Shapes Correspondent...,How the States Got Their Shapes Every shape o...,The user question does not need a clarificatio...,0,incorrect,-,...,0,1,0,1,1,0,1,0,"[tv.program, ratings.rated_entity, media_commo...","[media_common.creative_work, ratings.rated_ent..."
98,98,156,0,what is the creative work about The Crimean War?,"The Crimean War The Crimean War, also known i...","The Crimean War During the Crimean War, for t...",The user question does not need a clarificatio...,0,correct,incorrect,...,0,0,0,0,1,0,1,0,"[book.subject, film.subject, media_common.subj...","[media_common.creative_work, ratings.rated_ent..."


In [51]:
# Class balance of the single-turn sample, taken from the gold "label" column.
num_negative_items = int((single_turn_gpt_results["label"] == 0).sum())  # items without clarification need
num_positive_items = int((single_turn_gpt_results["label"] == 1).sum())  # items with clarification need
num_total = len(single_turn_gpt_results)
print(num_total)

100


In [52]:
# Full single-turn report: label metrics, explanation quality and error-type shares.
print("Single-turn Evaluation")
print('\n')
# Label evaluation: accuracy / precision / recall / F1 of each prompt's predicted label column
print(f"Prompt0 Evaluation Label-based")
prompt0_yn_eval_acc, prompt0_yn_eval_precision, prompt0_yn_eval_recall, prompt0_yn_eval_f1 = evaluate(single_turn_gpt_results, "prompt0_label")
print(f"Prompt1 Evaluation Label-based")
prompt1_yn_eval_acc, prompt1_yn_eval_precision, prompt1_yn_eval_recall, prompt1_yn_eval_f1 = evaluate(single_turn_gpt_results, "prompt1_label")
print(f"Prompt2 Evaluation Label-based")
prompt2_yn_eval_acc, prompt2_yn_eval_precision, prompt2_yn_eval_recall, prompt2_yn_eval_f1 = evaluate(single_turn_gpt_results, "prompt2_label")

print('\n')
# Explanation evaluation: share of "correct" among the "correct"/"incorrect" judgements (prompt1 and prompt2 only)
prompt1_correct_expl = evaluate_explanations(single_turn_gpt_results, "prompt1_eval_explanation-based")
print(f"Prompt1 Evaluation Explanation-based \n\t Percentage of correct explanations (only for the correctly predicted labels): {prompt1_correct_expl:.1%}")
prompt2_correct_expl = evaluate_explanations(single_turn_gpt_results, "prompt2_eval_explanation-based")
print(f"Prompt2 Evaluation Explanation-based \n\t Percentage of correct explanations (only for the correctly predicted labels): {prompt2_correct_expl:.1%}")

print('\n')
# Prompt hallucination / omission / incoherence / focus deviation evaluation:
# share of rows flagged 1 in the respective 0/1 column, per prompt
prompt1_focus_deviations = evaluate_hallucinations_omissions_incoherence(single_turn_gpt_results, 'prompt1_eval_focus-deviation')
print(f"Prompt1 Evaluation \n\t Percentage of focus deviations: {prompt1_focus_deviations:.1%}")
prompt1_hallucinations = evaluate_hallucinations_omissions_incoherence(single_turn_gpt_results, 'prompt1_eval_hallucination')
print(f"\t Percentage of hallucinations: {prompt1_hallucinations:.1%}")
prompt1_omissions = evaluate_hallucinations_omissions_incoherence(single_turn_gpt_results, 'prompt1_eval_omission')
print(f"\t Percentage of omissions: {prompt1_omissions:.1%}")
prompt1_incoherence = evaluate_hallucinations_omissions_incoherence(single_turn_gpt_results, 'prompt1_eval_incoherence')
print(f"\t Percentage of incoherences: {prompt1_incoherence:.1%}")
prompt2_focus_deviations = evaluate_hallucinations_omissions_incoherence(single_turn_gpt_results, 'prompt2_eval_focus-deviation')
print(f"Prompt2 Evaluation \n\t Percentage of focus deviations: {prompt2_focus_deviations:.1%}")
prompt2_hallucinations = evaluate_hallucinations_omissions_incoherence(single_turn_gpt_results, 'prompt2_eval_hallucination')
print(f"\t Percentage of hallucinations: {prompt2_hallucinations:.1%}")
prompt2_omissions = evaluate_hallucinations_omissions_incoherence(single_turn_gpt_results, 'prompt2_eval_omission')
print(f"\t Percentage of omissions: {prompt2_omissions:.1%}")
prompt2_incoherence = evaluate_hallucinations_omissions_incoherence(single_turn_gpt_results, 'prompt2_eval_incoherence')
print(f"\t Percentage of incoherences: {prompt2_incoherence:.1%}")

Single-turn Evaluation


Prompt0 Evaluation Label-based
	 Accuracy: 0.48
	 Precision: 0.38636363636363635
	 Recall: 0.40476190476190477
	 F1 score: 0.3953488372093023
Prompt1 Evaluation Label-based
	 Accuracy: 0.41
	 Precision: 0.4105263157894737
	 Recall: 0.9285714285714286
	 F1 score: 0.5693430656934306
Prompt2 Evaluation Label-based
	 Accuracy: 0.46
	 Precision: 0.35714285714285715
	 Recall: 0.35714285714285715
	 F1 score: 0.35714285714285715


Prompt1 Evaluation Explanation-based 
	 Percentage of correct explanations (only for the correctly predicted labels): 22.5%
Prompt2 Evaluation Explanation-based 
	 Percentage of correct explanations (only for the correctly predicted labels): 56.5%


Prompt1 Evaluation 
	 Percentage of focus deviations: 91.0%
	 Percentage of hallucinations: 9.0%
	 Percentage of omissions: 36.0%
	 Percentage of incoherences: 5.0%
Prompt2 Evaluation 
	 Percentage of focus deviations: 3.0%
	 Percentage of hallucinations: 18.0%
	 Percentage of omissions: 64.0%
	 P

#### Calculate metrics only for examples which are not unclear
Some corpus items have gold labels which are not self-explanatory. These items are marked as "unclear" (column `label_comment`) and are excluded from the dataset for the following evaluation.

In [None]:
# Keep only the items whose "label_comment" is not "unclear".
single_turn_gpt_results_clear = single_turn_gpt_results.loc[
    single_turn_gpt_results["label_comment"] != "unclear"
]
print(f"Number of items in test subset single-turn with verifiable label given entity text descriptions: {len(single_turn_gpt_results_clear)}")

Number of items in test subset single-turn with verifiable label given entity text descriptions: 66


In [None]:
# Label metrics of prompt1 and prompt2 on the subset without "unclear" items.
# NOTE: this rebinds the prompt1_*/prompt2_* metric variables that the full-sample
# single-turn evaluation cell also assigns.
print(f"Prompt1 Evaluation Yes-no-based (only test items with verifiable label given entity text descriptions)")
prompt1_yn_eval_acc, prompt1_yn_eval_precision, prompt1_yn_eval_recall, prompt1_yn_eval_f1 = evaluate(single_turn_gpt_results_clear, "prompt1_label")
print(f"Prompt2 Evaluation Yes-no-based (only test items with verifiable label given entity text descriptions)")
prompt2_yn_eval_acc, prompt2_yn_eval_precision, prompt2_yn_eval_recall, prompt2_yn_eval_f1 = evaluate(single_turn_gpt_results_clear, "prompt2_label")

Prompt1 Evaluation Yes-no-based (only test items with verifiable label given entity text descriptions)
	 Accuracy: 0.5757575757575758
	 Precision: 0.5967741935483871
	 Recall: 0.925
	 F1 score: 0.7254901960784315
Prompt2 Evaluation Yes-no-based (only test items with verifiable label given entity text descriptions)
	 Accuracy: 0.3939393939393939
	 Precision: 0.5
	 Recall: 0.325
	 F1 score: 0.393939393939394


#### Calculate number of prompt1 responses for positive clarification need

In [53]:
# Count the prompt1 responses that start with "Yes" and those that start with "No".
num_positive_predicted_prompt1 = len(single_turn_gpt_results[single_turn_gpt_results['prompt1_out'].str.contains('^Yes')])
num_negative_predicted_prompt1 = len(single_turn_gpt_results[single_turn_gpt_results['prompt1_out'].str.contains('^No',regex=True)])
assert num_positive_predicted_prompt1 + num_negative_predicted_prompt1 == len(single_turn_gpt_results) # every response must start with either "Yes" or "No"
print(f"Single-turn: Percentage of prompt1 responses predicting clarification need (class 1): {(num_positive_predicted_prompt1/len(single_turn_gpt_results))*100}%")
print(f"Single-turn: Percentage of prompt1 responses predicting no clarification need (class 0): {(num_negative_predicted_prompt1/len(single_turn_gpt_results))*100}%")

Single-turn: Percentage of prompt1 responses predicting clarification need (class 1): 95.0%
Single-turn: Percentage of prompt1 responses predicting no clarification need (class 0): 5.0%


## Multi-turn Results Evaluation

In [54]:
# Relative path of the multi-turn results CSV that is loaded and evaluated below.
multi_turn_gpt_results_path = './multi_turn_test_sample_evaluated.csv'

In [55]:
# Load the multi-turn results; both attribute columns are parsed with literal_eval.
multi_turn_gpt_results = pd.read_csv(
    multi_turn_gpt_results_path,
    delimiter=';',
    header=0,
    converters={'attrs_entity1': literal_eval, 'attrs_entity2': literal_eval},
)
# Remove the listed 'Unnamed: ...' columns (presumably stale saved indexes -- verify against the CSV).
multi_turn_gpt_results = multi_turn_gpt_results.drop(
    columns=['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.2', 'Unnamed: 0.3'],
)

In [56]:
# Class balance of the multi-turn sample, taken from the gold "label" column.
num_negative_items = int((multi_turn_gpt_results["label"] == 0).sum())  # items without clarification need
num_positive_items = int((multi_turn_gpt_results["label"] == 1).sum())  # items with clarification need
num_total = len(multi_turn_gpt_results)
print(num_total)

100


In [57]:
# Multi-turn report: label metrics, explanation quality and error-type shares.
print("Multi-turn Evaluation")
print('\n')
# Label evaluation: accuracy / precision / recall / F1 of each prompt's predicted label column
print(f"Prompt0 Evaluation Label-based")
prompt0_yn_eval_acc, prompt0_yn_eval_precision, prompt0_yn_eval_recall, prompt0_yn_eval_f1 = evaluate(multi_turn_gpt_results, "prompt0_label")
print(f"Prompt1 Evaluation Label-based")
prompt1_yn_eval_acc, prompt1_yn_eval_precision, prompt1_yn_eval_recall, prompt1_yn_eval_f1 = evaluate(multi_turn_gpt_results, "prompt1_label")
print(f"Prompt2 Evaluation Label-based")
prompt2_yn_eval_acc, prompt2_yn_eval_precision, prompt2_yn_eval_recall, prompt2_yn_eval_f1 = evaluate(multi_turn_gpt_results, "prompt2_label")
print(f"Prompt3 Evaluation Label-based")
prompt3_yn_eval_acc, prompt3_yn_eval_precision, prompt3_yn_eval_recall, prompt3_yn_eval_f1 = evaluate(multi_turn_gpt_results, "prompt3_label")

print('\n')
# Explanation evaluation: share of "correct" among the "correct"/"incorrect" judgements (prompt1 to prompt3)
prompt1_correct_expl = evaluate_explanations(multi_turn_gpt_results, "prompt1_eval_explanation-based")
print(f"Prompt1 Evaluation Explanation-based \n\t Percentage of correct explanations (only for the correctly predicted labels): {prompt1_correct_expl:.1%}")
prompt2_correct_expl = evaluate_explanations(multi_turn_gpt_results, "prompt2_eval_explanation-based")
print(f"Prompt2 Evaluation Explanation-based \n\t Percentage of correct explanations (only for the correctly predicted labels): {prompt2_correct_expl:.1%}")
prompt3_correct_expl = evaluate_explanations(multi_turn_gpt_results, "prompt3_eval_explanation-based")
print(f"Prompt3 Evaluation Explanation-based \n\t Percentage of correct explanations (only for the correctly predicted labels): {prompt3_correct_expl:.1%}")

print('\n')
# Prompt hallucination / omission / incoherence / focus deviation evaluation:
# share of rows flagged 1 in the respective 0/1 column, per prompt
prompt1_focus_deviations = evaluate_hallucinations_omissions_incoherence(multi_turn_gpt_results, 'prompt1_eval_focus-deviation')
print(f"Prompt1 Evaluation \n\t Percentage of focus deviations: {prompt1_focus_deviations:.1%}")
prompt1_hallucinations = evaluate_hallucinations_omissions_incoherence(multi_turn_gpt_results, 'prompt1_eval_hallucination')
print(f"\t Percentage of hallucinations: {prompt1_hallucinations:.1%}")
prompt1_omissions = evaluate_hallucinations_omissions_incoherence(multi_turn_gpt_results, 'prompt1_eval_omission')
print(f"\t Percentage of omissions: {prompt1_omissions:.1%}")
prompt1_incoherence = evaluate_hallucinations_omissions_incoherence(multi_turn_gpt_results, 'prompt1_eval_incoherence')
print(f"\t Percentage of incoherences: {prompt1_incoherence:.1%}")
prompt2_focus_deviations = evaluate_hallucinations_omissions_incoherence(multi_turn_gpt_results, 'prompt2_eval_focus-deviation')
print(f"Prompt2 Evaluation \n\t Percentage of focus deviations: {prompt2_focus_deviations:.1%}")
prompt2_hallucinations = evaluate_hallucinations_omissions_incoherence(multi_turn_gpt_results, 'prompt2_eval_hallucination')
print(f"\t Percentage of hallucinations: {prompt2_hallucinations:.1%}")
prompt2_omissions = evaluate_hallucinations_omissions_incoherence(multi_turn_gpt_results, 'prompt2_eval_omission')
print(f"\t Percentage of omissions: {prompt2_omissions:.1%}")
prompt2_incoherence = evaluate_hallucinations_omissions_incoherence(multi_turn_gpt_results, 'prompt2_eval_incoherence')
print(f"\t Percentage of incoherences: {prompt2_incoherence:.1%}")
# Prompt3 error analysis: share of rows flagged 1 in each prompt3 0/1 column.
# The header names Prompt3, matching the prompt3 columns evaluated here.
prompt3_focus_deviations = evaluate_hallucinations_omissions_incoherence(multi_turn_gpt_results, 'prompt3_eval_focus-deviation')
print(f"Prompt3 Evaluation \n\t Percentage of focus deviations: {prompt3_focus_deviations:.1%}")
prompt3_hallucinations = evaluate_hallucinations_omissions_incoherence(multi_turn_gpt_results, 'prompt3_eval_hallucination')
print(f"\t Percentage of hallucinations: {prompt3_hallucinations:.1%}")
prompt3_omissions = evaluate_hallucinations_omissions_incoherence(multi_turn_gpt_results, 'prompt3_eval_omission')
print(f"\t Percentage of omissions: {prompt3_omissions:.1%}")
prompt3_incoherence = evaluate_hallucinations_omissions_incoherence(multi_turn_gpt_results, 'prompt3_eval_incoherence')
print(f"\t Percentage of incoherences: {prompt3_incoherence:.1%}")

Multi-turn Evaluation


Prompt0 Evaluation Label-based
	 Accuracy: 0.56
	 Precision: 0.8888888888888888
	 Recall: 0.1568627450980392
	 F1 score: 0.26666666666666666
Prompt1 Evaluation Label-based
	 Accuracy: 0.52
	 Precision: 0.5151515151515151
	 Recall: 1.0
	 F1 score: 0.6799999999999999
Prompt2 Evaluation Label-based
	 Accuracy: 0.49
	 Precision: 0.5
	 Recall: 0.6666666666666666
	 F1 score: 0.5714285714285715
Prompt3 Evaluation Label-based
	 Accuracy: 0.47
	 Precision: 0.47368421052631576
	 Recall: 0.35294117647058826
	 F1 score: 0.40449438202247195


Prompt1 Evaluation Explanation-based 
	 Percentage of correct explanations (only for the correctly predicted labels): 9.6%
Prompt2 Evaluation Explanation-based 
	 Percentage of correct explanations (only for the correctly predicted labels): 30.6%
Prompt3 Evaluation Explanation-based 
	 Percentage of correct explanations (only for the correctly predicted labels): 41.3%


Prompt1 Evaluation 
	 Percentage of focus deviations: 94.0%
	 Perce

#### Calculate metrics only for examples which are not unclear
Some corpus items have gold labels which are not self-explanatory. These items are marked as "unclear" (column `label_comment`) and are excluded from the dataset for the following evaluation.

In [None]:
# Keep only the items whose "label_comment" is not "unclear".
multi_turn_gpt_results_clear = multi_turn_gpt_results.loc[
    multi_turn_gpt_results["label_comment"] != "unclear"
]
print(f"Number of items in test subset multi-turn with verifiable label given entity text descriptions: {len(multi_turn_gpt_results_clear)}")

Number of items in test subset multi-turn with verifiable label given entity text descriptions: 76


In [None]:
# Label metrics of prompt1 to prompt3 on the subset without "unclear" items.
# NOTE: this rebinds the prompt1_*/prompt2_*/prompt3_* metric variables that the
# full-sample multi-turn evaluation cell also assigns.
print(f"Prompt1 Evaluation Label-based (only test items with verifiable label given entity text descriptions)")
prompt1_yn_eval_acc, prompt1_yn_eval_precision, prompt1_yn_eval_recall, prompt1_yn_eval_f1 = evaluate(multi_turn_gpt_results_clear, "prompt1_label")
print(f"Prompt2 Evaluation Label-based (only test items with verifiable label given entity text descriptions)")
prompt2_yn_eval_acc, prompt2_yn_eval_precision, prompt2_yn_eval_recall, prompt2_yn_eval_f1 = evaluate(multi_turn_gpt_results_clear, "prompt2_label")
print(f"Prompt3 Evaluation Label-based (only test items with verifiable label given entity text descriptions)")
prompt3_yn_eval_acc, prompt3_yn_eval_precision, prompt3_yn_eval_recall, prompt3_yn_eval_f1 = evaluate(multi_turn_gpt_results_clear, "prompt3_label")

Prompt1 Evaluation Label-based (only test items with verifiable label given entity text descriptions)
	 Accuracy: 0.6447368421052632
	 Precision: 0.64
	 Recall: 1.0
	 F1 score: 0.7804878048780487
Prompt2 Evaluation Label-based (only test items with verifiable label given entity text descriptions)
	 Accuracy: 0.5333333333333333
	 Precision: 0.625
	 Recall: 0.6382978723404256
	 F1 score: 0.631578947368421
Prompt3 Evaluation Label-based (only test items with verifiable label given entity text descriptions)
	 Accuracy: 0.4342105263157895
	 Precision: 0.5925925925925926
	 Recall: 0.3333333333333333
	 F1 score: 0.4266666666666667


#### Calculate number of prompt1 responses for positive clarification need

In [58]:
# Count the prompt1 responses that start with "Yes" and those that start with "No".
num_positive_predicted_prompt1 = len(multi_turn_gpt_results[multi_turn_gpt_results['prompt1_out'].str.contains('^Yes')])
num_negative_predicted_prompt1 = len(multi_turn_gpt_results[multi_turn_gpt_results['prompt1_out'].str.contains('^No',regex=True)])
assert num_positive_predicted_prompt1 + num_negative_predicted_prompt1 == len(multi_turn_gpt_results) # every response must start with either "Yes" or "No"
print(f"Multi-turn: Percentage of prompt1 responses predicting clarification need (class 1): {(num_positive_predicted_prompt1/len(multi_turn_gpt_results))*100}%")
print(f"Multi-turn: Percentage of prompt1 responses predicting no clarification need (class 0): {(num_negative_predicted_prompt1/len(multi_turn_gpt_results))*100}%")

Multi-turn: Percentage of prompt1 responses predicting clarification need (class 1): 99.0%
Multi-turn: Percentage of prompt1 responses predicting no clarification need (class 0): 1.0%


In [59]:
# Show the multi-turn items whose prompt1 response starts with "No".
multi_turn_gpt_results[multi_turn_gpt_results['prompt1_out'].str.contains('^No',regex=True)]

Unnamed: 0,Unnamed: 0.4,sample_index,corpus_index,label,context,entity1,entity2,prompt1_out,prompt1_label,prompt1_eval_label-based,...,prompt2_eval_focus-deviation,prompt2_eval_hallucination,prompt2_eval_omission,prompt2_eval_incoherence,prompt3_eval_focus-deviation,prompt3_eval_hallucination,prompt3_eval_omission,prompt3_eval_incoherence,attrs_entity1,attrs_entity2
28,28,28,736,0,What is sequel of real steel <EOS> Real steel ...,Real Steel A struggling fighter-turned-promot...,Real steel 2 Real Steel 2 is an upcoming scie...,"No, the user question does not need a clarific...",0,correct,...,1,0,0,0,1,0,1,0,"[media_common.adaptation, broadcast.content, a...","[ratings.rated_entity, film.film, media_common..."


# Random baseline
Set up a random baseline on the data for result comparison.

The baseline uses scikit-learn's `DummyClassifier`; see the documentation: https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html
Strategies used:
- “uniform”: generates predictions uniformly at random from the list of unique classes observed in y, i.e. each class has equal probability.
- “most_frequent”: the predict method always returns the most frequent class label in the observed y argument passed to fit

In [60]:
from sklearn.dummy import DummyClassifier

### Single-turn

In [61]:
# Build one text input per item (context + both entity descriptions) and the gold labels.
x_values_pandas_single_turn = single_turn_gpt_results["context"] + " " + single_turn_gpt_results["entity1"] + " " + single_turn_gpt_results["entity2"]
x_values_single_turn = x_values_pandas_single_turn.to_numpy()
y_values_single_turn = single_turn_gpt_results["label"].to_numpy()

# create one dummy classifier per baseline strategy (fixed random_state for repeatable results)
dummy_clf_uniform = DummyClassifier(strategy='uniform', random_state=42)
dummy_clf_mostfreq = DummyClassifier(strategy='most_frequent', random_state=42)
# fit both baselines on the single-turn sample
dummy_clf_uniform.fit(x_values_single_turn, y_values_single_turn)
dummy_clf_mostfreq.fit(x_values_single_turn, y_values_single_turn)
# accuracy is scored on the same items the classifiers were fitted on
print(f"Dummy classifier on single-turn data, strategy uniform, accuracy: {dummy_clf_uniform.score(x_values_single_turn, y_values_single_turn)}")
print(f"Dummy classifier on single-turn data, strategy most-frequent, accuracy: {dummy_clf_mostfreq.score(x_values_single_turn, y_values_single_turn)}")

Dummy classifier on single-turn data, strategy uniform, accuracy: 0.46
Dummy classifier on single-turn data, strategy most-frequent, accuracy: 0.58


### Multi-turn

In [None]:
# Build one text input per item (context + both entity descriptions) and the gold labels.
x_values_pandas_multi_turn = multi_turn_gpt_results["context"] + " " + multi_turn_gpt_results["entity1"] + " " + multi_turn_gpt_results["entity2"]
x_values_multi_turn = x_values_pandas_multi_turn.to_numpy()
y_values_multi_turn = multi_turn_gpt_results["label"].to_numpy()

# create one dummy classifier per baseline strategy (fixed random_state for repeatable results)
dummy_clf_uniform = DummyClassifier(strategy='uniform', random_state=42)
dummy_clf_mostfreq = DummyClassifier(strategy='most_frequent', random_state=42)
# fit both baselines on the multi-turn sample
dummy_clf_uniform.fit(x_values_multi_turn, y_values_multi_turn)
dummy_clf_mostfreq.fit(x_values_multi_turn, y_values_multi_turn)
# accuracy is scored on the same items the classifiers were fitted on;
# the messages name the multi-turn data that is evaluated here
print(f"Dummy classifier on multi-turn data, strategy uniform, accuracy: {dummy_clf_uniform.score(x_values_multi_turn, y_values_multi_turn)}")
print(f"Dummy classifier on multi-turn data, strategy most-frequent, accuracy: {dummy_clf_mostfreq.score(x_values_multi_turn, y_values_multi_turn)}")

Dummy classifier on single-turn data, strategy uniform, accuracy: 0.53
Dummy classifier on single-turn data, strategy most-frequent, accuracy: 0.51


# Entity type evaluation
Evaluate for which entity types GPT performs best.

In [65]:
def get_entity_type_cnts(df, prompt_label, entity_cnts, classified_correct):
  """Count KB attribute occurrences over the correctly or incorrectly classified items.

  Selects the rows of `df` whose gold "label" equals (or differs from) the
  predicted label, counts every attribute listed in "attrs_entity1" and
  "attrs_entity2" of those rows, and prints summary statistics.

  Args:
    df: DataFrame with "label", `prompt_label`, "attrs_entity1" and
      "attrs_entity2" columns; the attribute columns hold sequences of
      attribute names.
    prompt_label: Name of the column holding the predicted labels.
    entity_cnts: Dict mapping attribute -> count; it is updated in place.
    classified_correct: True selects the rows with label == prediction,
      False the rows with label != prediction.

  Returns:
    A new dict with the contents of `entity_cnts`, sorted by count in
    descending order (ties keep their insertion order).
  """
  if classified_correct: # GPT classified label correct
    selected_items = df[df['label'] == df[prompt_label]]
    print("Corpus items correctly classified by GPT prompt continuations")
  else: # GPT classified label incorrect
    selected_items = df[df['label'] != df[prompt_label]]
    print("Corpus items incorrectly classified by GPT prompt continuations")

  # per item: mean number of attributes over its two entities, summed over all items
  summed_attrs_per_item = 0
  for attrs_entity1, attrs_entity2 in zip(selected_items['attrs_entity1'], selected_items['attrs_entity2']):
    summed_attrs_per_item += (len(attrs_entity1) + len(attrs_entity2)) / 2
    # entity1 attributes are counted before entity2 attributes (fixes the tie order)
    for attrs in (attrs_entity1, attrs_entity2):
      for attr in attrs:
        entity_cnts[attr] = entity_cnts.get(attr, 0) + 1

  # an empty selection has no average; report 0.0 instead of dividing by zero
  num_items = len(selected_items)
  avg_num_attrs = summed_attrs_per_item / num_items if num_items else 0.0

  # sort dict by value
  entity_cnts_sorted = dict(sorted(entity_cnts.items(), key=lambda item: item[1], reverse=True))
  print(f"\t Average number of attributes per corpus item: {avg_num_attrs}")
  print(f"\t Number of distinct KB attributes: {len(entity_cnts_sorted.keys())}")
  print(f"\t Total number of occurrences: {sum(entity_cnts_sorted.values())}")
  return entity_cnts_sorted

In [66]:
def evaluate_categories_for_entity_attributes(datasplit, df, prompt):
  """Print which KB attributes occur among correctly vs. incorrectly classified items.

  Args:
    datasplit: Name of the data split, used only in the printed header.
    df: Results DataFrame passed on to get_entity_type_cnts.
    prompt: Name of the predicted-label column to evaluate.

  Returns:
    None; all results are printed.
  """
  def _by_count_desc(cnts):
    # order a count dict by descending count
    return dict(sorted(cnts.items(), key=lambda pair: pair[1], reverse=True))

  def _exclusive(cnts, other):
    # attributes of `cnts` that never occur in `other`
    return _by_count_desc({attr: cnt for attr, cnt in cnts.items() if attr not in other})

  def _top10_shares(cnts):
    # relative frequency of each attribute, limited to the first ten entries
    total = sum(cnts.values())
    shares = {attr: cnt / total for attr, cnt in cnts.items()}
    return dict(list(shares.items())[0: 10])

  print(f"--- Evaluation for {datasplit} data with {prompt} --- \n")

  # attribute counts over the items GPT labelled correctly, then incorrectly
  correct_cnts = get_entity_type_cnts(df, prompt, {}, classified_correct=True)
  incorrect_cnts = get_entity_type_cnts(df, prompt, {}, classified_correct=False)

  print("KB attributes which were only correctly classified:")
  print(_exclusive(correct_cnts, incorrect_cnts))

  print("KB attributes which were only incorrectly classified:")
  print(_exclusive(incorrect_cnts, correct_cnts))

  print("Top 10 correctly classified entity attributes:")
  print(_top10_shares(correct_cnts))

  print("Top 10 incorrectly classified entity attributes:")
  print(_top10_shares(incorrect_cnts))
  print('\n')

## Single-turn

In [67]:
# Run the entity-attribute analysis for every single-turn prompt label column.
for prompt_label_column in ('prompt0_label', 'prompt1_label', 'prompt2_label'):
  evaluate_categories_for_entity_attributes('single-turn', single_turn_gpt_results, prompt_label_column)

--- Evaluation for single-turn data with prompt0_label --- 

Corpus items correctly classified by GPT prompt continuations
	 Average number of attributes per corpus item: 5.791666666666667
	 Number of distinct KB attributes: 148
	 Total number of occurrences: 556
Corpus items incorrectly classified by GPT prompt continuations
	 Average number of attributes per corpus item: 5.451923076923077
	 Number of distinct KB attributes: 146
	 Total number of occurrences: 567
KB attributes which were only correctly classified:
{'geography.mountain': 6, 'government.politician': 3, 'medicine.drug': 3, 'book.character': 2, 'tv.character': 2, 'government.u_s_congressperson': 2, 'music.soundtrack': 2, 'tv.crewmember': 2, 'book.subject': 2, 'film.subject': 2, 'education.educational_institution.school_type': 2, 'school"': 2, 'education.school': 2, 'film.director': 2, 'tv.director': 2, 'location.admin_division_2': 2, 'location.us_county': 2, 'medicine.medical_treatment': 2, 'medicine.medical_procedure': 2

## Multi-turn

In [68]:
# Run the entity-attribute analysis for every multi-turn prompt label column.
for prompt_label_column in ('prompt0_label', 'prompt1_label', 'prompt2_label', 'prompt3_label'):
  evaluate_categories_for_entity_attributes('multi-turn', multi_turn_gpt_results, prompt_label_column)

--- Evaluation for multi-turn data with prompt0_label --- 

Corpus items correctly classified by GPT prompt continuations
	 Average number of attributes per corpus item: 8.633928571428571
	 Number of distinct KB attributes: 165
	 Total number of occurrences: 967
Corpus items incorrectly classified by GPT prompt continuations
	 Average number of attributes per corpus item: 10.022727272727273
	 Number of distinct KB attributes: 76
	 Total number of occurrences: 882
KB attributes which were only correctly classified:
{'astronomy.celestial_object': 10, 'astronomy.orbital_relationship': 10, 'conferences.conference_subject': 6, 'business.employer': 5, 'business.operation': 5, 'organization.organization': 5, 'astronomy.celestial_object_with_coordinate_system': 5, 'astronomy.star': 5, 'astronomy.star_system_body': 5, 'astronomy.asteroid': 5, 'astronomy.astronomical_discovery': 5, 'tv.program': 5, 'cvg.developer': 4, 'computer.software_developer': 4, 'cvg.publisher': 4, 'games.publisher': 4, 'm