In [1]:
import json
import time

import guidance
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.special import softmax

In [2]:
# Load the two measurement result files used below.
# UTF-8 is requested explicitly: without it `open()` falls back to the
# platform's locale encoding, which can mis-decode non-ASCII characters
# (such as typographic quotes) stored in the JSON text.
with open('measure-results-qa.json', encoding='utf-8') as f:
    qa_results = json.load(f)

with open('measure-results.json', encoding='utf-8') as f:
    finetune_results = json.load(f)

In [3]:
#qa_results

In [4]:
# Pick best model (see other notebook)
mid = 'ft-7xUj5LuaGNe45xXZcMRgVlF2'

# `finetune_results` is rebound here from the list of all runs to the single
# run whose model id equals `mid`. The isinstance guard keeps the cell safe to
# re-run: once narrowed, the value is no longer a list and is left untouched.
if isinstance(finetune_results, list):
    matches = [d for d in finetune_results if d['model']['id'] == mid]
    if not matches:
        # Same exception type as indexing an empty list, but with context.
        raise IndexError(f"no entry in finetune_results has model id {mid!r}")
    finetune_results = matches[0]
elif finetune_results['model']['id'] != mid:
    # Already narrowed to some other model: refuse to silently keep stale data.
    raise ValueError(
        f"finetune_results is already narrowed to model "
        f"{finetune_results['model']['id']!r}, not {mid!r}; reload the JSON first"
    )

In [5]:
# Hand-written example triple used to try out rate() before scoring the full
# datasets: the question asked, the reference answer, and a proposed answer
# (hypothesis) to be compared against the reference.
question = '''Is 2022 feeling more like a "normal" year for you?'''
reference = '''Thankfully 2022 has felt a bit more like a “normal” year.  I saw slightly more people in person, got into a groove at work, and continued to play and have fun with my daughter.  It’s not quite like it was before the pandemic but it doesn’t feel strange anymore.  It’s not clear whether it’s because I’m getting used to this new environment, or if it is actually getting back to normal.  Like most things, it’s probably a mix of both.'''
hypothesis = '''Yes, 2022 is feeling more like a "normal" year for me. I'm seeing slightly more people in person, getting into a groove at work, and continuing to play and have fun with my daughter. It's not quite like it was before the pandemic, but it doesn't feel strange anymore. I\'m getting used to this new environment and it's nice to get into a groove where I can start to feel productive in many different areas of my life.'''

In [6]:
# Completion-style template. rate() uses it for every model other than 'gpt-4'
# (in practice 'text-davinci-003'). The final `select` fills the 'rating'
# variable from the `valid_nums` options and stores per-option values under
# 'logprobs', which rate() reads back and normalises with softmax.
prompt_gpt35 = """QUESTION: {{question}}

ANSWER: {{reference}}

PROPOSED ANSWER: {{hypothesis}}

Can you rate the PROPOSED ANSWER to the above QUESTION from 0 (not even close) to 10 (exact meaning) on whether or not it matches ANSWER?  Only output the number.
{{select 'rating' options=valid_nums logprobs='logprobs'}}"""

# Chat-style template used by rate() for 'gpt-4'. It carries the same
# instructions inside system/user/assistant role blocks, but 'rating' comes
# from a free-form `gen` (temperature 0, at most 2 tokens) rather than a
# `select`, so no 'logprobs' variable is produced by this template.
prompt_gpt4 = """{{#system~}}
You are a helpful assistant.
{{~/system}}
{{#user~}}
QUESTION: {{question}}

ANSWER: {{reference}}

PROPOSED ANSWER: {{hypothesis}}

Can you rate the PROPOSED ANSWER to the above QUESTION from 0 (not even close) to 10 (exact meaning) on whether or not it matches ANSWER?  Only output the number.
{{~/user}}
{{#assistant~}}
{{gen 'rating' temperature=0 max_tokens=2}}
{{~/assistant}}"""

def rate(model, question, reference, hypothesis):
    """Have an OpenAI model grade how well `hypothesis` matches `reference`.

    Parameters
    ----------
    model
        Either 'gpt-4' or 'text-davinci-003'; any other value fails the assert.
    question, reference, hypothesis
        Values substituted into the prompt template (presumably plain strings).

    Returns
    -------
    For 'gpt-4': the program's 'rating' variable exactly as generated.
    For 'text-davinci-003': a dict pairing each key of the program's
    'logprobs' variable with the softmax of the corresponding values.

    Side effect: rebinds the global `guidance.llm` to a fresh
    `guidance.llms.OpenAI(model)` instance on every call.
    """
    assert model in ['gpt-4', "text-davinci-003"]

    guidance.llm = guidance.llms.OpenAI(model)

    is_gpt4 = model == 'gpt-4'
    template = prompt_gpt4 if is_gpt4 else prompt_gpt35
    program = guidance(template, silent=True)

    # Run the template; `valid_nums` ('0'..'10') is only referenced by the
    # completion-style prompt's `select`.
    executed_program = program(
        question=question,
        reference=reference,
        hypothesis=hypothesis,
        valid_nums=[str(i) for i in range(0, 11)],
    )

    if is_gpt4:
        return executed_program['rating']

    # Normalise the per-option values so they sum to 1 across the options.
    option_logprobs = executed_program['logprobs']
    probabilities = softmax(list(option_logprobs.values()))
    return dict(zip(option_logprobs.keys(), probabilities))

# Try both graders on the hand-written example and show the two results.
output35 = rate(
    model='text-davinci-003',
    question=question,
    reference=reference,
    hypothesis=hypothesis,
)
output4 = rate(
    model='gpt-4',
    question=question,
    reference=reference,
    hypothesis=hypothesis,
)
(output35, output4)

({'0': 0.0,
  '1': 0.0,
  '2': 0.0,
  '3': 0.0,
  '4': 0.0,
  '5': 0.0,
  '6': 0.0035792483573586094,
  '7': 0.3304311187610009,
  '8': 0.10131157024200545,
  '9': 0.2934031560077535,
  '10': 0.2712749066318814},
 '8')

In [7]:
# Grade every entry in qa_results['data'] with both models and save the
# augmented entries to disk.
#
# Loop-local names are prefixed with `entry_` so the example
# `question` / `reference` / `hypothesis` defined earlier in the notebook are
# not overwritten by the last processed entry.
qa_entries = qa_results['data']
total = len(qa_entries)

results = []
for n_done, entry in enumerate(qa_entries, start=1):
    entry_question = entry['training_data']['prompt']
    entry_reference = entry['reference']
    entry_hypothesis = entry['hypothesis']

    # Shallow copy: the new keys are added to the copy, not to the loaded entry.
    result = entry.copy()
    result['gpt35'] = rate('text-davinci-003', entry_question, entry_reference, entry_hypothesis)
    result['gpt4'] = rate('gpt-4', entry_question, entry_reference, entry_hypothesis)
    results.append(result)

    if n_done % 10 == 0:
        # Pause after every 10 entries (presumably to stay under API rate limits).
        time.sleep(5)
        print(f"* Processed {n_done} of {total}")

with open('measure-results-qa-gpt.json', 'w') as f:
    json.dump(results, f)

* Processed 10 of 669
* Processed 20 of 669
* Processed 30 of 669
* Processed 40 of 669
* Processed 50 of 669
* Processed 60 of 669
* Processed 70 of 669
* Processed 80 of 669
* Processed 90 of 669
* Processed 100 of 669
* Processed 110 of 669
* Processed 120 of 669
* Processed 130 of 669
* Processed 140 of 669
* Processed 150 of 669
* Processed 160 of 669
* Processed 170 of 669
* Processed 180 of 669
* Processed 190 of 669
* Processed 200 of 669
* Processed 210 of 669
* Processed 220 of 669
* Processed 230 of 669
* Processed 240 of 669
* Processed 250 of 669
* Processed 260 of 669
* Processed 270 of 669
* Processed 280 of 669
* Processed 290 of 669
* Processed 300 of 669
* Processed 310 of 669
* Processed 320 of 669
* Processed 330 of 669
* Processed 340 of 669
* Processed 350 of 669
* Processed 360 of 669
* Processed 370 of 669
* Processed 380 of 669
* Processed 390 of 669
* Processed 400 of 669
* Processed 410 of 669
* Processed 420 of 669
* Processed 430 of 669
* Processed 440 of 6

In [None]:
# Grade every entry in finetune_results['data'] with both models and save the
# augmented entries to disk.
#
# Loop-local names are prefixed with `entry_` so the example
# `question` / `reference` / `hypothesis` defined earlier in the notebook are
# not overwritten by the last processed entry.
finetune_entries = finetune_results['data']
total = len(finetune_entries)

results = []
for n_done, entry in enumerate(finetune_entries, start=1):
    entry_question = entry['training_data']['prompt']
    entry_reference = entry['reference']
    entry_hypothesis = entry['hypothesis']

    # Shallow copy: the new keys are added to the copy, not to the loaded entry.
    result = entry.copy()
    result['gpt35'] = rate('text-davinci-003', entry_question, entry_reference, entry_hypothesis)
    result['gpt4'] = rate('gpt-4', entry_question, entry_reference, entry_hypothesis)
    results.append(result)

    if n_done % 10 == 0:
        # Pause after every 10 entries (presumably to stay under API rate limits).
        time.sleep(5)
        print(f"* Processed {n_done} of {total}")

with open('measure-results-finetune-gpt.json', 'w') as f:
    json.dump(results, f)

* Processed 10 of 669
* Processed 20 of 669
* Processed 30 of 669
* Processed 40 of 669
* Processed 50 of 669
* Processed 60 of 669
* Processed 70 of 669
* Processed 80 of 669
* Processed 90 of 669
* Processed 100 of 669
* Processed 110 of 669
* Processed 120 of 669
* Processed 130 of 669
* Processed 140 of 669
* Processed 150 of 669
* Processed 160 of 669
* Processed 170 of 669
* Processed 180 of 669
* Processed 190 of 669
* Processed 200 of 669
* Processed 210 of 669
* Processed 220 of 669
* Processed 230 of 669
* Processed 240 of 669
* Processed 250 of 669
* Processed 260 of 669
* Processed 270 of 669
* Processed 280 of 669
* Processed 290 of 669
* Processed 300 of 669
* Processed 310 of 669
* Processed 320 of 669
* Processed 330 of 669
* Processed 340 of 669
* Processed 350 of 669
* Processed 360 of 669
* Processed 370 of 669
* Processed 380 of 669
* Processed 390 of 669
* Processed 400 of 669
* Processed 410 of 669
* Processed 420 of 669
* Processed 430 of 669
* Processed 440 of 6