In [57]:
#Read completion data
import json

def prep_completions(completion_file):
    """Load sampled completions from a JSONL file.

    Every non-blank line is parsed as a JSON object holding a 'completions'
    list. Each entry of that list is itself a JSON-encoded string, and the
    passage text is read from its ['message']['content'] field.

    Loading is best-effort per line: a line that cannot be decoded, or whose
    structure does not match the layout above, is reported on stdout and
    skipped as a whole (no partial passage set is kept for it).

    Args:
        completion_file: Path of the JSONL file, read as UTF-8.

    Returns:
        A 2-D list with one inner list of passage strings per successfully
        parsed line, in file order.
    """
    completion_list = []  # output 2d list of prompt completions
    with open(completion_file, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            # Blank lines (e.g. a trailing newline) carry no record; skip them
            # instead of reporting them as JSON decode errors.
            if not line.strip():
                continue
            try:
                data = json.loads(line)
                passage_set = [
                    json.loads(completion)['message']['content']
                    for completion in data['completions']
                ]
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line {line_number}: {e}")
            except (KeyError, TypeError) as e:
                # Valid JSON with an unexpected shape: skip this record rather
                # than aborting the whole load.
                print(f"Unexpected record structure on line {line_number}: {e!r}")
            else:
                completion_list.append(passage_set)
    return completion_list

# Input JSONL files holding the generated responses (two variants: base and "_b").
privacyqa_completion_file = 'privacyqa_generated_responses.jsonl'
privacyqa_completion_file_b = 'privacyqa_generated_responses_b.jsonl'

# Only the "_b" file is loaded at the moment; the base-file load is kept
# commented out below.
#privacyqa_completion_list = prep_completions(privacyqa_completion_file)
completion_list = prep_completions(privacyqa_completion_file_b)

# Report how many completion sets (one per parsed input line) were loaded.
print(f"Created list of {len(completion_list)} completion sets")


Created list of 500 completion sets


In [58]:
#Collect sentence level BERTScore
import torch
import time
import spacy
from selfcheckgpt.modeling_selfcheck import SelfCheckBERTScore
from bert_score import score

# Pick CUDA when available, otherwise CPU. In the code visible here `device`
# is only printed; it is not passed to the scorer or to spaCy.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("using device ", device)
# Module-level scorer and spaCy pipeline; both are read as globals by doBERT.
selfcheck_bertscore = SelfCheckBERTScore(rescale_with_baseline=True)
nlp = spacy.load("en_core_web_sm")

# Output paths for the per-sentence score files (base and "_b" variants).
privacyqa_results_file='privacyqa_bertscore_results.txt'
privacyqa_results_file_b='privacyqa_bertscore_results_b.txt'


def doBERT(completion_list, output_path):
    """Score every completion set with SelfCheck-BERTScore and log the scores.

    For each set, the first passage is split into sentences with the
    module-level spaCy pipeline ``nlp`` and handed to the module-level
    ``selfcheck_bertscore`` scorer together with the full set of passages.
    The returned scores are written to ``output_path`` as one
    space-separated line per set.

    A set that raises at any step is reported on stdout and skipped: no line
    is written for it and nothing is added to the returned list, so line
    numbers in the output file only match ``completion_list`` indices when no
    set fails.

    Args:
        completion_list: Iterable of passage lists, one list per prompt.
        output_path: File to (over)write with the scores.

    Returns:
        List of the scorer's results, one entry per successfully scored set.
    """
    all_scores = []
    with open(output_path, 'w') as sink:
        for sample_set in completion_list:
            try:
                # spaCy sentence tokenization of the first passage in the set.
                parsed = nlp(sample_set[0])
                split_sentences = []
                for span in parsed.sents:
                    split_sentences.append(span.text.strip())

                # NOTE(review): the whole set, including sample_set[0] itself,
                # is passed as the sampled passages - confirm that scoring the
                # first passage against itself is intended.
                per_sentence = selfcheck_bertscore.predict(
                    sentences=split_sentences,
                    sampled_passages=sample_set,
                )
                all_scores.append(per_sentence)

                rendered = ' '.join(str(value) for value in per_sentence)
                sink.write(rendered + '\n')
            except Exception as e:
                print(f"Error processing BERTScore: \n{e}")
                time.sleep(.1)  # Brief pause before moving on to the next set
    return all_scores
   
# Score the loaded completion sets; per-set scores are written to the "_b" results file.
score_results = doBERT(completion_list, privacyqa_results_file_b)




SelfCheck-BERTScore initialized


In [60]:
#Get Mean BERTScore
import numpy as np

def mean_score(filename):
    """Return the mean of the per-line mean scores stored in a results file.

    Each line of the file is expected to hold whitespace-separated numbers.
    The numbers on a line are averaged first, then those per-line averages
    are averaged again, so every line carries the same weight regardless of
    how many numbers it contains.

    Lines with no numbers (blank or whitespace-only) are ignored, and runs of
    several spaces between numbers are tolerated.

    Args:
        filename: Path of the results file to read.

    Returns:
        The mean of the per-line means, or 0 when the file contains no
        numeric lines.

    Raises:
        ValueError: If a token on a line cannot be parsed as a float.
    """
    averages_list = []
    with open(filename, 'r') as file:
        for line in file:
            # split() with no argument drops empty tokens, so a blank line
            # gives an empty list instead of [''] (which float parsing rejects).
            tokens = line.split()
            if not tokens:
                continue
            numbers = np.array(tokens, dtype=float)
            averages_list.append(np.mean(numbers))
    if averages_list:
        return np.mean(averages_list)
    return 0

# NOTE(review): this reads the base results file, while the visible scoring
# call writes to privacyqa_results_file_b - confirm which file is intended.
privacyqa_results_file='privacyqa_bertscore_results.txt'

# Print the mean of the per-line mean scores for the chosen results file.
print("Mean privacyqa score: ",mean_score(privacyqa_results_file))

Mean privacyqa score:  0.49318810144564745
