In [1]:
#Read completion data
import json

def prep_completions(completion_file):
    """Load prompt completions from a JSON Lines file.

    Each non-blank line is parsed as a JSON object whose ``'completions'``
    entry is iterated; every item is itself decoded as JSON and its
    ``['message']['content']`` value is collected.

    Args:
        completion_file: Path of the ``.jsonl`` file to read (opened as UTF-8).

    Returns:
        A 2D list: one inner list of passage texts per successfully parsed
        line, in file order. A line whose ``'completions'`` is empty yields an
        empty inner list. Blank lines are skipped silently; lines that fail to
        decode or do not have the expected structure are reported on stdout
        and skipped as a whole.
    """
    completion_list = []  # output 2d list of prompt completions
    with open(completion_file, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            # A blank line (e.g. a trailing newline at end of file) is not a
            # malformed record, so do not report it as a decode error.
            if not line.strip():
                continue
            try:
                data = json.loads(line)
                passage_set = [
                    json.loads(completion)['message']['content']
                    for completion in data['completions']
                ]
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line {line_number}: {e}")
            except (KeyError, TypeError) as e:
                # Valid JSON with an unexpected shape (missing key, non-dict
                # payload, non-string completion): skip this record instead
                # of aborting the whole load.
                print(f"Unexpected record structure on line {line_number}: {e!r}")
            else:
                # Only append once the whole line has been decoded, so a
                # partially parsed record never reaches the output.
                completion_list.append(passage_set)
    return completion_list

# Paths of the generated-response JSON Lines files.
privacyqa_completion_file = "privacyqa_generated_responses.jsonl"
privacyqa_completion_file_b = "privacyqa_generated_responses_b.jsonl"
cancerqa_completion_file = "cancerqa_generated_responses.jsonl"
cancerqa_completion_file_b = "cancerqa_generated_responses_b.jsonl"
cancerq_privacya_completion_file = "cancerq_privacya_generated_responses.jsonl"
privacyq_cancera_completion_file = "privacyq_cancera_generated_responses.jsonl"
chatgpt_completion_file = "chatgpt_generated_responses_b.jsonl"
chatgpt_privacya_completion_file = "chatgptq_privacya_generated_responses.jsonl"
chatgpt_cancera_completion_file = "chatgptq_cancera_generated_responses.jsonl"

# Load the completion sets for the file selected for this run; swap the
# argument for another path above to process a different file.
completion_list = prep_completions(chatgpt_cancera_completion_file)

print(f"Created list of {len(completion_list)} completion sets")


Created list of 300 completion sets


In [2]:
#Collect sentence level BERTScore
import torch
import time
import spacy
from selfcheckgpt.modeling_selfcheck import SelfCheckBERTScore
from bert_score import score

# Report whether CUDA is visible to torch. In this cell `device` is only
# printed; it is not passed to the scorer below.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("using device ", device)

# Scorer and sentence splitter shared by doBERT().
selfcheck_bertscore = SelfCheckBERTScore(rescale_with_baseline=True)
nlp = spacy.load("en_core_web_sm")

# Paths of the BERTScore result files.
privacyqa_results_file = "privacyqa_bertscore_results.txt"
privacyqa_results_file_b = "privacyqa_bertscore_results_b.txt"
cancerqa_results_file = "cancerqa_bertscore_results.txt"
cancerqa_results_file_b = "cancerqa_bertscore_results_b.txt"
cancerq_privacya_results_file = "cancerq_privacya_bertscore_results.txt"
privacyq_cancera_results_file = "privacyq_cancera_bertscore_results.txt"
chatgpt_results_file = "chatgpt_bertscore_results.txt"
chatgpt_privacya_results_file = "chatgpt_privacya_bertscore_results.txt"
chatgpt_cancera_results_file = "chatgpt_cancera_bertscore_results.txt"


def doBERT(completion_list, output_path):
    """Compute sentence-level SelfCheck-BERTScore for each completion set.

    For every set, the first passage is split into sentences with the
    module-level spaCy pipeline ``nlp`` and scored by the module-level
    ``selfcheck_bertscore`` against the passages of that set. Each scored set
    contributes one line of space-separated scores to ``output_path``.

    Args:
        completion_list: Iterable of passage lists (one list per prompt).
        output_path: File that is (over)written with one line per scored set.

    Returns:
        List of the per-set score sequences, in input order. Sets that are
        empty, whose first passage yields no sentences, or that fail during
        scoring are reported on stdout and omitted from both the returned
        list and the output file.
    """
    results = []
    with open(output_path, 'w') as outfile:
        for set_index, passages in enumerate(completion_list):
            # An empty set has no passage to score. Report it explicitly
            # instead of letting passages[0] raise an opaque IndexError.
            if not passages:
                print(f"Skipping completion set {set_index}: no passages")
                continue
            try:
                # spacy sentence tokenization of the first passage
                sentences = [sent.text.strip() for sent in nlp(passages[0]).sents]
                if not sentences:
                    # Nothing to score; writing an empty line would add a
                    # record with no numbers to the results file.
                    print(f"Skipping completion set {set_index}: no sentences in first passage")
                    continue
                # NOTE(review): passages[0] is both the passage being scored
                # and one of the sampled passages. Confirm that comparing it
                # against itself is intended (vs. passages[1:]).
                sent_scores_bertscore = selfcheck_bertscore.predict(
                    sentences=sentences,        # list of sentences
                    sampled_passages=passages,  # list of sampled passages
                )
                results.append(sent_scores_bertscore)
                outfile.write(' '.join(map(str, sent_scores_bertscore)) + '\n')
            except Exception as e:
                # Best-effort: one failing set must not abort the whole run.
                print(f"Error processing BERTScore: \n{e}")
                time.sleep(.1)  # Brief pause in case
    return results
   
# Score the loaded completion sets and write them to the matching results file.
score_results = doBERT(
    completion_list=completion_list,
    output_path=chatgpt_cancera_results_file,
)




using device  cpu
SelfCheck-BERTScore initialized
Error processing BERTScore: 
list index out of range


In [3]:
#Get Mean BERTScore
import numpy as np

def mean_score(filename):
    """Return the mean of the per-line mean scores stored in a results file.

    Each line of the file is expected to hold whitespace-separated numbers.
    The numbers of a line are averaged first, and the returned value is the
    average of those per-line averages.

    Args:
        filename: Path of the results file to read.

    Returns:
        The mean of the per-line means, or ``0`` when the file contains no
        numeric lines.

    Raises:
        ValueError: If a token on any line cannot be converted to float.
    """
    averages_list = []
    with open(filename, 'r') as file:
        for line in file:
            # split() without an argument returns [] for a blank line and
            # ignores repeated whitespace; split(' ') produced '' tokens in
            # both cases, which the float conversion rejects.
            numbers = np.array(line.split(), dtype=float)
            if numbers.size > 0:
                averages_list.append(np.mean(numbers))
    return np.mean(averages_list) if averages_list else 0

# Paths of the BERTScore result files to summarise.
privacyqa_results_file = "privacyqa_bertscore_results.txt"
privacyqa_results_file_b = "privacyqa_bertscore_results_b.txt"
cancerqa_results_file = "cancerqa_bertscore_results.txt"
cancerqa_results_file_b = "cancerqa_bertscore_results_b.txt"
cancerq_privacya_results_file = "cancerq_privacya_bertscore_results.txt"
privacyq_cancera_results_file = "privacyq_cancera_bertscore_results.txt"
chatgpt_results_file = "chatgpt_bertscore_results.txt"
chatgpt_privacya_results_file = "chatgpt_privacya_bertscore_results.txt"
chatgpt_cancera_results_file = "chatgpt_cancera_bertscore_results.txt"

# Print one summary line per results file, in a fixed order.
for _label, _results_path in (
    ("Mean privacyqa score: ", privacyqa_results_file),
    ("Mean privacyqa baseline score: ", privacyqa_results_file_b),
    ("Mean cancerqa score: ", cancerqa_results_file),
    ("Mean cancerqa baselinescore: ", cancerqa_results_file_b),
    ("Mean cancerq on privacy model score: ", cancerq_privacya_results_file),
    ("Mean privacyq on cancer model score: ", privacyq_cancera_results_file),
    ("Mean chatgpt baseline model score: ", chatgpt_results_file),
    ("Mean chatgpt privacy model score: ", chatgpt_privacya_results_file),
    ("Mean chatgpt cancer model score: ", chatgpt_cancera_results_file),
):
    print(_label, mean_score(_results_path))



Mean privacyqa score:  0.49318810144564745
Mean privacyqa baseline score:  0.4259576255342392
Mean cancerqa score:  0.3730463860646061
Mean cancerqa baselinescore:  0.4458717826296549
Mean cancerq on privacy model score:  0.4092485118156171
Mean privacyq on cancer model score:  0.5439597276587225
Mean chatgpt baseline model score:  0.4365593959120497
Mean chatgpt privacy model score:  0.4214101734642622
Mean chatgpt cancer model score:  0.5129708493690874
