In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# ROUGE measurements for the QA run: a single result object that is handed
# straight to compute_metrics() further down.
with open('measure-results-qa.json') as qa_file:
    qa_results = json.load(qa_file)

# ROUGE measurements for the fine-tuned runs: iterated one result at a time
# further down.
with open('measure-results.json') as finetune_file:
    finetune_results = json.load(finetune_file)

In [None]:
#qa_results

In [None]:
#finetune_results

In [None]:
def compute_metrics(result):
    """Flatten one measurement result into a per-sample ROUGE F1 table.

    Parameters
    ----------
    result : dict
        Must provide ``result['model']`` with the keys ``'id'`` and
        ``'model'`` and, optionally, a ``'hyperparams'`` dict holding
        ``'n_epochs'``, ``'batch_size'`` and ``'learning_rate_multiplier'``.
        Must also provide ``result['data']``, an iterable of samples that each
        carry ``sample['rouge'][variant]['f']`` for the variants ``rouge-1``,
        ``rouge-2`` and ``rouge-l``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``result['data']`` with the columns ``id``,
        ``model``, ``n_epochs``, ``batch_size``, ``learning_rate_multiplier``,
        ``rouge-1_f1``, ``rouge-2_f1`` and ``rouge-l_f1``. When the model has
        no ``'hyperparams'`` entry the three hyperparameter columns are 0.
        An empty ``data`` list yields an empty frame that still has these
        columns, so it can be concatenated and grouped like any other.

    Raises
    ------
    KeyError
        If any of the keys listed above is missing.
    """
    hyperparam_names = ('n_epochs', 'batch_size', 'learning_rate_multiplier')
    rouge_variants = ('rouge-1', 'rouge-2', 'rouge-l')
    columns = [
        'id',
        'model',
        *hyperparam_names,
        *(f'{variant}_f1' for variant in rouge_variants),
    ]

    # Run-level metadata is the same for every sample, so build it once
    # instead of once per row.
    model = result['model']
    if 'hyperparams' in model:
        hyperparams = {name: model['hyperparams'][name] for name in hyperparam_names}
    else:
        # No hyperparameters recorded for this run: use 0 as the placeholder.
        hyperparams = dict.fromkeys(hyperparam_names, 0)
    run_info = {'id': model['id'], 'model': model['model'], **hyperparams}

    rows = [
        {
            **run_info,
            **{f'{variant}_f1': sample['rouge'][variant]['f'] for variant in rouge_variants},
        }
        for sample in result['data']
    ]
    # Passing `columns` keeps the schema stable even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)


In [None]:
# One metrics frame per run: the QA result first, then every fine-tuned result.
dfs = [compute_metrics(qa_results)]
dfs.extend(compute_metrics(r) for r in finetune_results)

# Stack all runs into a single long table (each frame keeps its own row index).
df_results = pd.concat(dfs)
df_results

In [None]:
# Per-run summary of every ROUGE column: mean, lower quartile, median, upper quartile.
run_keys = ['model', 'n_epochs', 'batch_size', 'learning_rate_multiplier', 'id']
summary_stats = [
    'mean',
    lambda x: x.quantile(0.25),
    'median',
    lambda x: x.quantile(0.75),
]
df_results.groupby(run_keys).agg(summary_stats)

In [None]:
# ROUGE-1 F1 summary restricted to the two runs being compared head to head.
selected_ids = ['QA-prompt_only', 'ft-7xUj5LuaGNe45xXZcMRgVlF2']
is_selected = df_results['id'].isin(selected_ids)
df = df_results.loc[is_selected, ['id', 'rouge-1_f1']]
df.groupby(['id']).agg(
    ['mean', lambda x: x.quantile(0.25), 'median', lambda x: x.quantile(0.75)]
)

In [None]:
metric = 'rouge-2_f1'

# Restrict to the two runs being compared and keep only the metric of interest.
df = df_results[df_results['id'].isin(['QA-prompt_only', 'ft-7xUj5LuaGNe45xXZcMRgVlF2'])][['id', metric]]

# Quantiles 0.1 ... 0.9 of the metric per run. After unstack() there is one row
# per run id and one column per quantile.
percentiles = df.groupby('id')[metric].quantile([i / 10.0 for i in range(1, 10)]).unstack()

# One line per run, with the quantile on the x axis.
fig, ax = plt.subplots()
percentiles.T.plot(ax=ax, marker='o')

# Fix the tick positions to the plotted quantiles BEFORE assigning labels.
# Without set_xticks the labels are attached to whatever ticks the automatic
# locator picked, so they can end up on the wrong positions.
ax.set_xticks(list(percentiles.columns))
ax.set_xticklabels(percentiles.columns)
ax.set_xlabel('Percentile')
ax.set_ylabel('Value')
ax.set_title('Percentiles by 10 for Each Group')
ax.legend(title='Group')

plt.show()

In [None]:
import rouge

# Score a single hand-written sentence pair (both lower-cased) to inspect the
# raw structure returned by the rouge package.
hypothesis = "AI accelerators facilitate extensive text processing in large language models".lower()
reference = "Large language models use AI accelerators for improved processing and training.".lower()

scorer = rouge.Rouge()
scorer.get_scores(hypothesis, reference)

In [None]:
# NOTE(review): scratch arithmetic with no stated purpose. Presumably a hand
# check of one of the ROUGE ratios for the sentence pair scored above; confirm
# what it verifies or delete the cell.
6/11

# GPT Results

In [4]:
# GPT-graded results for the QA run.
# NOTE: this rebinds `qa_results`, which earlier held the contents of
# measure-results-qa.json. Re-running the ROUGE cells after this one would
# operate on the new value instead.
with open('measure-results-qa-gpt.json') as qa_gpt_file:
    qa_results = json.load(qa_gpt_file)

# GPT-graded results for the fine-tuned run.
with open('measure-results-finetune-gpt.json') as ft_gpt_file:
    ft_results = json.load(ft_gpt_file)

In [7]:
# GPT-4 grade per QA record, coerced to float.
qa_gpt4 = [float(record['gpt4']) for record in qa_results]

# GPT-3.5 grade per QA record: each key of record['gpt35'] is parsed as an int
# and multiplied by its value, then the products are summed.
# NOTE(review): presumably the values are per-score probabilities, making this
# an expected score; confirm against the code that produced the JSON.
qa_gpt35 = [
    sum(int(score) * weight for score, weight in record['gpt35'].items())
    for record in qa_results
]

s_qa_gpt35 = pd.Series(qa_gpt35)
s_qa_gpt4 = pd.Series(qa_gpt4)

pd.DataFrame({'gpt3.5': s_qa_gpt35, 'gpt4': s_qa_gpt4}).describe()

Unnamed: 0,gpt3.5,gpt4
count,669.0,669.0
mean,8.746007,7.64275
std,1.199723,2.631459
min,0.103222,0.0
25%,8.565983,7.0
50%,8.975328,8.0
75%,9.291991,10.0
max,9.932703,10.0


In [8]:
# GPT-4 grade per fine-tuned record, coerced to float.
gpt4 = [float(record['gpt4']) for record in ft_results]

# GPT-3.5 grade per fine-tuned record: each key of record['gpt35'] is parsed as
# an int and multiplied by its value, then the products are summed.
# NOTE(review): presumably the values are per-score probabilities, making this
# an expected score; confirm against the code that produced the JSON.
gpt35 = [
    sum(int(score) * weight for score, weight in record['gpt35'].items())
    for record in ft_results
]

s_gpt35 = pd.Series(gpt35)
s_gpt4 = pd.Series(gpt4)

pd.DataFrame({'gpt3.5': s_gpt35, 'gpt4': s_gpt4}).describe()

Unnamed: 0,gpt3.5,gpt4
count,669.0,669.0
mean,8.048375,4.87145
std,1.256419,2.889933
min,0.015435,0.0
25%,7.511328,2.0
50%,8.243559,5.0
75%,8.81862,7.0
max,9.977577,10.0


# Examples

In [10]:
# Show the first GPT-graded QA record to see which fields each entry carries.
qa_results[0]

{'reference': ' Thankfully 2022 has felt a bit more like a “normal” year.  I saw \nslightly\n more people in person, got into a groove at work, and continued to play and have fun with my daughter.  It’s not quite like it was before the pandemic but it doesn’t feel strange anymore.  It’s not clear whether it’s because I’m getting used to this new environment, or if it is actually getting back to normal.  Like most things, it’s probably a mix of both.',
 'hypothesis': '\nYes, 2022 is feeling more like a "normal" year for me. I\'m seeing slightly more people in person, getting into a groove at work, and continuing to play and have fun with my daughter. It\'s not quite like it was before the pandemic, but it doesn\'t feel strange anymore. I\'m getting used to this new environment and it\'s nice to get into a groove where I can start to feel productive in many different areas of my life.',
 'training_data': {'prompt': 'QUESTION: Is 2022 feeling more like a "normal" year for you?\n\n###\n\n'

In [38]:
close = []
outliers_qa = []
outliers_ft = []
qa_wins = ft_wins = ties = 0

# Walk the QA and fine-tuned results in lockstep (same index = same question).
for i, ft_score in enumerate(gpt4):
    qa_score = qa_gpt4[i]
    qa_record = qa_results[i]
    example = {
        'reference': qa_record['reference'],
        'qa': qa_record['hypothesis'],
        'ft': ft_results[i]['hypothesis'],
        'question': qa_record['training_data']['prompt'],
        'gpt4_qa': qa_score,
        'gpt4_ft': ft_score,
    }

    # Head-to-head tally on the GPT-4 grade.
    if ft_score > qa_score:
        ft_wins += 1
    elif qa_score > ft_score:
        qa_wins += 1
    else:
        ties += 1

    # Bucket the interesting cases. The three conditions are mutually
    # exclusive: a gap above 9 in one direction rules out both the opposite gap
    # and a gap below 1.
    if (ft_score - qa_score) > 9:
        outliers_ft.append(example)
    elif (qa_score - ft_score) > 9:
        outliers_qa.append(example)
    elif abs(ft_score - qa_score) < 1 and ft_score > 8:
        close.append(example)

print(f'ft_wins: {ft_wins}\nqa_wins: {qa_wins}\nties:{ties}')
len(outliers_ft), len(outliers_qa), len(close)

ft_wins: 81
qa_wins: 486
ties:102


(8, 39, 21)

In [36]:
def print_example(example):
    """Print one comparison example as a question followed by three bullets.

    ``example`` is a dict with the keys ``'question'``, ``'reference'``,
    ``'qa'``, ``'ft'``, ``'gpt4_qa'`` and ``'gpt4_ft'``. The question is
    printed first, then the reference text, then the QA hypothesis (labelled
    "RAG Hypo") and the fine-tuned hypothesis (labelled "FT Hypo"), each with
    its GPT-4 grade in parentheses. Nothing is returned.
    """
    lines = [
        f"{example['question']}",
        f" * Reference: {example['reference']}",
        f" * RAG Hypo ({example['gpt4_qa']}): {example['qa']}",
        f" * FT Hypo ({example['gpt4_ft']}):  {example['ft']}",
    ]
    print("\n".join(lines))

In [32]:
# Dump every example where the QA answer's GPT-4 grade beat the fine-tuned
# answer's grade by more than 9 points.
separator = "\n==============================================================================\n"
for example in outliers_qa:
    print_example(example)
    print(separator)

QUESTION: What are the reasons for the author's company being acquired?

###


 * Reference:  Six years ago I decided to take a leap of faith and join a diverse group of intellectually curious individuals on their journey to building the future of enterprise AI.  Of course none of us knew that at the time, we were just a bunch of wide eyed dreamers hoping for the best, desperately trying to find a product-market fit.  It seems like a lifetime ago when I was sitting in that cold, poorly lit room surrounded by cheap Ikea desks, stained carpets, and a large refrigerator box-turned-cubicle at 95 King Street.  The contrast to where we ended up could not be more stark: being acquired by one of the fastest growing supply chain management companies in the world for
 
our
 technology, people and products.
  The combination of these three reasons is a testament to everything that we built here.
 * RAG Hypo (10.0):  The company was acquired for its technology, people, and products. The stars alig

In [33]:
# Dump every example where the fine-tuned answer's GPT-4 grade beat the QA
# answer's grade by more than 9 points.
separator = "\n==============================================================================\n"
for example in outliers_ft:
    print_example(example)
    print(separator)

QUESTION: Are you making new habits?

###


 * Reference:  I’m trying to change that but making new habits takes time.
 * RAG Hypo (0.0): 
Yes, I am making new habits. I am trying to make the best hour of the day a habit by setting a low bar of reading one page per day. I also use the Due App to set daily reminders to help me stay on track. I am also trying to make writing a habit by setting aside time for it when I can.
 * FT Hypo (10.0):   I’m trying to change that but making new habits takes time.


QUESTION: Who wrote "My Life"?

###


 * Reference:  My Life (Bill Clinton)
 * RAG Hypo (0.0): 

The book "Models of My Life" was written by Herbert A. Simon. It is an autobiography looking at his distinguished multi-disciplinary career filled with tidbits of incredibly valuable wisdom throughout. It is a candid and witty autobiography that I highly recommend reading.
 * FT Hypo (10.0):   My Life (Bill Clinton)


QUESTION: What are your plans for the coming year?

###


 * Reference:  Th

In [34]:
# Dump every example where both answers got nearly the same GPT-4 grade
# (gap below 1) and the fine-tuned grade is above 8.
separator = "\n==============================================================================\n"
for example in close:
    print_example(example)
    print(separator)

QUESTION: What has been the biggest event in your life?

###


 * Reference:  Family: 
The biggest event to happen to me has been on the personal front: my daughter was born this past summer!  She’s been the center of my life since she was born, and has now become my primary motivation for practically everything.  More money, more time, more knowledge?  It’s all so I can give and teach her the best that I can provide.
 * RAG Hypo (10.0): 
The biggest event to happen to me in 2020 has been on the personal front: my daughter was born this past summer! She's been the center of my life since she was born, and has become my primary motivation for practically everything. Having a daughter has given me a new sense of meaning in my life and has shifted my priorities and challenges. It's been an amazing experience and I'm so grateful for her.
 * FT Hypo (10.0):   Family: 
The biggest event has been the birth of my daughter.  She’s been the center of my life since she was born, and has now becom