# ArthurBench: Specificity Demo

In [3]:
import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from arthur_bench.run.testsuite import TestSuite

pd.set_option('display.max_colwidth', None)

# Prepare dataset

Using subset of 25 examples from "Explain like im Five" subreddit, from Stanford Human Preferences (SHP) Validation dataset. Filtered for `score_ratios` > 2.0.

In [14]:
eli5 = pd.read_csv('/content/eli5_25.csv')

eli5.head(1)

Unnamed: 0,history,score_A,score_B,human_ref_A,human_ref_B,labels,score_ratio
0,"Explain like I'm five years old: Since plastic doesn't biodegrade, why aren't more houses built with plastic instead of wood parts?",9,3,"Structural plastic is softer (weaker) and more expensive (or at least it was until the recent lumber shortage), and it's also susceptible to damage from UV rays in sunlight (plastic exposed to sunlight will become brittle and crack and break). Plastic with additives to deal with the UV are even more expensive. We do use a lot of plastic in home construction, just not structurally. Plastic sheeting for moisture barrier, plastic water pipes, plastic (or more accurately polymer) electrical junction boxes, plastic (polymer) wire insulation, kitchen cabinet veneers are sometimes made of plastics, etc.",Because heating/cooling your home would be problematic. I don't want to end up like a leftover in the back of the fridge.,1,3.0


# Make a test suite

In [15]:
suite_spec = TestSuite(
    name='specificity',
    scoring_method='specificity',
    reference_data=eli5,
    input_column='history'
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


# Run the tests


In [16]:
run_A = suite_spec.run(
    run_name="A",
    candidate_data=eli5,
    candidate_column='human_ref_A'
)

625it [00:00, 1228.29it/s]


In [22]:
run_B = suite_spec.run(
    run_name="B",
    candidate_data=eli5,
    candidate_column='human_ref_B'
)

625it [00:00, 3455.35it/s]


In [24]:
A_scores = []
for t in run_A.test_cases:
    A_scores.append(t.score)

B_scores = []
for t in run_B.test_cases:
    B_scores.append(t.score)

print(len(A_scores), len(B_scores))

25 25


## Compare to LLM Grading

In [None]:
#run llm scorer
import os
os.environ['OPENAI_API_KEY'] = ""##fill in
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate, AIMessagePromptTemplate, HumanMessagePromptTemplate, BasePromptTemplate
import openai
import time
from tqdm import tqdm

In [None]:
#prompt - from https://github.com/i-Eval/FairEval/blob/main/FairEval.py

system_message_prompt = SystemMessagePromptTemplate.from_template(
  "You are a helpful and precise assistant for checking the helpfulness of an answer to a specific prompt."
  """We would like to request your feedback on the helpfulness of 2 responses to the PROMPT.
    Please rate the helpfulness, as measured by how useful, relevant and the level of details of the responses.

    Each response receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.
    Please first provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.
    Then, output two lines indicating the scores for Response 1 and 2, respectively.

    Output with the following format:

    Evaluation evidence: <your evluation explanation here>
    Score of the Response 1: <score>
    Score of the Response 2: <score>"""
)

comparison_template = HumanMessagePromptTemplate.from_template(
    """
    PROMPT: {prompt}
    Response 1: {response_1}
    The end of Response 1.
    ----
    Response 2: {response_2}
    The end of Response 2.
    """
)

llm_evaluate = ChatPromptTemplate.from_messages([system_message_prompt, comparison_template])
llmchain= LLMChain(llm=ChatOpenAI(temperature=0, max_tokens=512), prompt=llm_evaluate)

def query(prompt, response_1, response_2):
    for i in range(4): #max API RETRY
        try:
            response = llmchain({"prompt": prompt, "response_1": response_1, "response_2": response_2})["text"]
            return response
        except openai.error.RateLimitError:
            print('rate limit')
            time.sleep(30)
        except Exception as e:
            print('error')
    raise RuntimeError(f"Failed after 4 retries.")

def parse_score_from_review(review):
    try:
        score1 = review.split("\n")[-2]
        score2 = review.split("\n")[-1]
        score1 = score1.split(":")[-1].strip()
        score2 = score2.split(":")[-1].strip()
        return [float(score1), float(score2)]
    except:
        print(f'Failed to parse scores from {review}')
        return [-1, -1]

def get_scores(prompts, responses_1, responses_2):
    llm_score_1= []
    llm_score_2= []
    all_scores=[]
    for i in tqdm(range(len(prompts))):
        score_1a, score_2a= parse_score_from_review(query(prompts[i], responses_1[i], responses_2[i]))
        score_2b, score_1b = parse_score_from_review(query(prompts[i], responses_2[i], responses_1[i]))
        all_scores.append([score_1a, score_1b, score_2a, score_2b])
        score_1 = (score_1a + score_1b)/2
        score_2 = (score_2a + score_2b)/2
        llm_score_1.append(score_1)
        llm_score_2.append(score_2)
    return llm_score_1, llm_score_2, all_scores

In [None]:
p= eli5['history'].values
r1=eli5['human_ref_A'].values
r2=eli5['human_ref_B'].values

llm_score_1, llm_score_2, all_scores = get_scores(p, r1, r2)

In [25]:
def assign_label(A,B):
  #it is 1 if A is preferred to B; 0 if B is preferred to A, 0.5 if it is a tie - according to SHP rules
  if A>B:
    return 1
  elif B>A:
    return 0
  else:
    return 0.5

In [28]:
llm_scores= pd.read_csv('/content/llm_scores_specificity.csv')
llm_scores.head(1)

Unnamed: 0,history,score_A,score_B,human_ref_A,human_ref_B,labels,score_ratio,llm_score_A,llm_score_B,llm_pref,human_llm_match
0,"Explain like I'm five years old: Since plastic doesn't biodegrade, why aren't more houses built with plastic instead of wood parts?",9,3,"Structural plastic is softer (weaker) and more expensive (or at least it was until the recent lumber shortage), and it's also susceptible to damage from UV rays in sunlight (plastic exposed to sunlight will become brittle and crack and break). Plastic with additives to deal with the UV are even more expensive. We do use a lot of plastic in home construction, just not structurally. Plastic sheeting for moisture barrier, plastic water pipes, plastic (or more accurately polymer) electrical junction boxes, plastic (polymer) wire insulation, kitchen cabinet veneers are sometimes made of plastics, etc.",Because heating/cooling your home would be problematic. I don't want to end up like a leftover in the back of the fridge.,1,3.0,8.5,2.5,1.0,True


In [30]:
llm_scores['human_llm_match'].value_counts()

True     13
False    12
Name: human_llm_match, dtype: int64

In [33]:
full_scores = llm_scores.copy()

full_scores['spec_score_A'] = A_scores
full_scores['spec_score_B'] = B_scores
full_scores['spec_pref'] = full_scores.apply(lambda x: assign_label(x['spec_score_A'], x['spec_score_B']), axis=1)

full_scores['human_spec_match']= full_scores['labels']==full_scores['spec_pref']

In [34]:
full_scores['human_spec_match'].value_counts()

True     19
False     6
Name: human_spec_match, dtype: int64

In [35]:
full_scores[full_scores.human_spec_match == True][full_scores.human_llm_match == False]

  full_scores[full_scores.human_spec_match == True][full_scores.human_llm_match == False]


Unnamed: 0,history,score_A,score_B,human_ref_A,human_ref_B,labels,score_ratio,llm_score_A,llm_score_B,llm_pref,human_llm_match,spec_score_A,spec_score_B,spec_pref,human_spec_match
3,Eli5 How exactly does Noise cancellation work? That too in such small airbuds,948,5,"You know how your headphones can reproduce any sound? Noise cancelling headphones have microphones to detect what outside sounds you're about to hear and then make the exact opposite sound at exactly the right time. Any sound (no matter how complex or loud) + its exact opposite = no sound at all, much like 5 + (-5) = 0. I could get into superposition and all that, but that's probably beyond age 5.","The sound you hear is a wave that is a sum of all sounds around you. Waves have a few relevant properties: They travel at a known velocity, and they are additive. To cancel such wave in your ear: we measure the wave just outside the ear and play its inverse with a small delay from the earphone. Notably, this only cancels the sound in a very small region around the inner side of the earphone. Everywhere else it adds its miniscule amount of more sound to the wave. For best results: You need a good microphone in both earphones, and a good algorithm to slightly alter the wave, to mimic hiw it will be altered by the earlobe (as the in-ear earphone sound is not altered by the earlobe identically to the sound coming from the outside). Fortunately, we can tune this individually: place a second microphone inside each ear canal (near the very tip of the earphone), and measure which delay and which amplitude modifications reduce the sound the most. A good analogue: Look at the waves in the see. Measure the height of the wave. If it is above the mean water level, push the water down with a paddle you have placed under the surface. If it is below, push the water up. If you move your paddle at just the right speed for a given measurement, you can destroy the wave around your paddle (whilst creating a new wave around your paddle, propagating outwards and adding a bit to the waves everywhere else in the sea).",1,189.6,6.0,9.0,0.0,False,0.50602,0.426907,1,True
6,"Explain like I'm five years old how is the US banking system not crippled by credit card fraud? I come from a country where most people don't have a credit card. I cannot wrap my head around this documentary. Now I get that credit card fraud *is* a big problem. But if all you need is the card number to order stuff, and card numbers are so easy to come by as shown in the documentary, then why is not every single criminal in the US running a credit card fraud operation? It seems super easy and low risk. Get a burner device, order stuff from public wifi, and don't be stupid about the delivery locations.",30,114,"I mean, you also have to remember that this particular scam was from back in the late '80s and early '90s. Banks have gotten *considerably* better at detecting fraudulent charges now, and even for the charges that slip through the total amount of money lost is pretty insignificant.",OP that documentary is from 1992. Banking and Credit Systems have evolved substantially in 30 years. You're thinking the entire US banking system is still being ran like its the 1980s.,0,3.8,8.5,7.0,1.0,False,0.5448,0.82614,0,True
8,"Explain like I'm five years old What's so special about water and why is it the most important substance in industrial processes and life on Earth? It's practically used for everything; drinking, cooking, cleaning, chemical reactions, generating electricity and the list goes on...but what makes it so suitable for basically everything?",5,11,"Off the top of my head... Availability and affordability, as it is the most common liquid on the planet. Being a singular substance, rather than a mix of others, meaning it can be contaminated, but not destroyed, by most processes. Relatively easy to purify, via filtering, boiling, etc. Relatively non toxic. Relatively non compressible as a liquid, which is useful in various applications. Highly reusable in processes such as steam powered turbines. I'm sure there are other particular chemical characteristics that make it useful in specific applications.","Water is an excellent solvent, many, many substances can be dissolved by it. There is a humongous amount of water on the Earth. As a result, it is one of the least expensive chemicals available.",0,2.2,8.5,5.5,1.0,False,0.513261,0.558839,0,True
11,"Explain like I'm five years old: London's population in 1900 was around 6 million, where did they all live?! I've seen maps of London at around this time and it is tiny compared to what it is now. Was the population density a lot higher? Did there used to be taller buildings? It seems strange to imagine so many people packed into such a small space. Ty",146,714,"Not specific to London, but here in North America population densities were crazy high per square meter of city space vs what we have now. Like a factor of 20x higher. Throughout the 1920s several of the 'problematic' city ordinances that urbanists like to hate on came into effect. They came into effect to effectively ban or break up many of the dwelling that were catering to the underclass. As an example, there were rooming houses dotted all over where one could rent a mattress on the floor for as little time as a night for what amounted to a couple bucks in today's money. These houses would be stuffed to full of migrant workers and other assorted poor people. Basically they were dens of disease, crime and filth and poverty. They also represent the market providing shelter for the bottom of the barrel and for those who may not have the right skin tone to stay in better accommodations. So, in parallel with the introduction of the car, the city passed zoning laws that forbade these places. Now there were max limits to how many people could stay in a dwelling. This shut down the boarding houses because the landlord can't make a go of it without jacking rates. (If you can't have 100 people paying a dollar per night, then you need to find one guy to pay 100 per night.) There were zones that industrial activity could take place and they must be separated from where people lived forcing folks to travel longer distances from home to job. People use to have 'servant' quarters in their back yard, but banned. This is kind of a poor explanation, but hopefully it gives some kind of a sense of what happened. At the end of the day, some of the changes were needed to combat rampant social disorder, but many of the changes were pushed to the extreme in order to try and entirely eliminate the 'undesirable' parts of the population. The problem has been that we've now created a system that nobody but the rich can afford to live.","They crammed everyone close together. For those homeless/sleeping rough: Penny sit-up: You could rest sitting on a bench but could not lay down, or really sleep (sleeping wasn't including in the price) Twopenny hangover: you would sleep hanging over a rope for two pennies Four-penny coffin: finally some rest laying down packed like sardines, infested with bugs though (so said Orwell) https://www.historic-uk.com/CultureUK/Two-Penny-Hangover/ This is for Victorian age but it continued into the 20th century (and tbf, so did the Victorian age)",0,4.890411,5.0,5.0,0.5,False,0.486942,0.594562,0,True
15,Explain like I'm five years old: why is a chip on a credit card considered ‘safer’ than swiping the magnetic strip?,2266,291,"The magnetic strip is like a secret code that lets you buy things. I can copy your secret code and use it to buy things. The chip is like a little man who makes secret codes that can each be used to buy one thing. I can copy the secret code but not the little man. Because the secret code only works once and for a limited time, and in one situation, stealing the secret code isn’t useful. You can’t steal the little man without doing a lot of work.","The chip is actually a tiny computer that is powered by the reader. It has a secret number inside of it that cannot be read. Only the bank knows the number. There's no way to ask it the secret number. Instead, you can only give it another number, and it will do some math on that number and its secret number and tell you another number. That's what happens when you read the card. The bank picks a number and asks the card to respond. The bank does the same math, and if your card has the same secret number it must be legit. Now, you're probably thinking someone could figure out the secret number by just getting it to do the math enough times. But the numbers involved are so big, this will take too long to be practical, more than 10 years to get enough numbers to have a shred of making a guess. Even with very modern computers. That's longer than your card's expiration date so it's fine. And if computers get fast enough the math fails, the banks can simply change the chips to use new algorithms and new, bigger numbers that take even longer to crack.",1,7.786942,6.5,9.0,0.0,False,0.500565,0.448071,1,True
17,Explain like I'm five years old: What exactly is phlegm and why/how does the human body produce so much of it?,5,2,"Mucous is mostly made up of water, and is given its properties by long, grippy protein strands. The protein acts like a mop, but much better. I can hold on to itself very well, along with water, and even more water between the strands. Your body has lots of water, and it takes a very small amount of protein to make mucus. So unless you're severely dehydrated, or on the verge of death by malnutrition, you have an endless supply of phlegm.","Phlegm aka mucus of the respiratory tract is a very useful substance. First lungs need to be wet so that oxygen can go into the body and carbon dioxide can go out. Mucus is good for this because it is thick like mayonnaise and won't immediately run down and pool in the bottom of your lungs. But even a jar of mayonnaise that you have shaken will eventually run down to the bottom of the jar. So secondly, to avoid this, our respiratory tract is lined with little hairs that can move and paddle the mucus upwards. This is great because in addition to preventing the mucus from pooling at the bottom of your lungs, it actually paddles the mucus up from your lungs and into your throat where it goes into your stomach. This leads to the third great thing about mucus because it helps to keep our lungs clean. All that dust in the air will collect in your lungs much like it collects in your house. That mucus traps the dust and it is pushed out of your lungs and into your stomach where you can eventually get rid of it by pooping.",1,2.5,7.5,9.0,0.0,False,0.445564,0.423027,1,True
18,"Explain like I'm five years old: If soap breaks down oil, why can’t oil be broken down with soap and then diluted to go down the drain?",128,18,"Soap/detergents don't break oil down. They simply emulsify it, or hold it in suspension. It is still 100% oil, just distributed differently... in the water rather than on the water.","It can, and is, all the time. Do you ever wash your dishes? Wash a pan you cooked with? When you do that, you use dish soap, and then the soap helps break down the oils, and washes down the drain. If you've ever worked on a car and got your hands all greasy, then you go inside and you use hot water and soap and you wash your hands really good, all those oils go down your drain. Where you run into problems, is when people pour oil and grease down their drain. Because it's not diluted enough, and it will then solidify in your pipes and cause them to clog. Then you pay big bucks to have it fixed.",1,7.111111,5.5,8.5,0.0,False,0.586834,0.432945,1,True
21,Explain like I'm five years old How do chip manufacturers keep their IP from being reversed engineered. How do chip manufacturers keep their IP from being reversed engineered? Is it too difficult for other companies to understand how the chips and boards were made and try to come up with how to make it themselves?,2,4,"Reverse engineering the process of how a chip was made is next to impossible. It's like 3D printing at a microscopic level with a plethora of different chemicals, metals and lasers. Then you have to scale and replicate while also having similar precision to what you're trying to copy. You'd have to have detailed data and plans and a lot of them copied, to get something that could be similar. Also lots of money.",I worked at Mostek in the 70s and 80s and we reverse engineered the 8086 and 68000 families. Our RAM and calculator chips were copied by many others. There was no law against it as copyright only covered the aesthetics and not function. Congress passed a law allowing copyright of photomask's and that ended it.,0,2.0,8.5,4.5,1.0,False,0.454712,0.661681,0,True
23,"Explain like I'm five years old: Why aren't cigarettes/tobacco a ""schedule 1"" drug, if they are scientifically proven to cause hundreds of thousands of deaths each year?",37,3,Scheduling of drugs is not done solely based on the scientific evidence. Political and economic concerns play a role.,"What 59179 said, but also because it is so widespread and in common usage, trying to limit it outright could lead to huge backlash (think prohibition). The good news is they're slowly phasing it out though, and finally getting more and more regulations in place.",1,12.333333,6.0,8.0,0.0,False,0.69125,0.443299,1,True
