<a href="https://colab.research.google.com/github/automix-llm/automix/blob/main/colabs/%5BAutomix%5D_SelfVerify_Step2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Run verification on the llama2-13b/70b outputs

In [None]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm


In [None]:
# Get these outputs from https://drive.google.com/file/d/1dhyt7UuYumk9Gae9eJ_mpTVrLeSTuRht/view?usp=sharing
# and make sure data/automix_llamapair_outputs.jsonl exists before running the next cell.

In [None]:
# Load the JSON-lines file (one record per line) into a dataframe.
llama2_outputs = pd.read_json("data/automix_llamapair_outputs.jsonl", lines=True, orient="records")

In [None]:
# Preview the first row: the question and the answers predicted by the two llama models.
llama2_outputs[['question', 'llama13b_pred_ans', 'llama70b_pred_ans']].head(1)

Unnamed: 0,question,llama13b_pred_ans,llama70b_pred_ans
0,What is another term for a trashcan in an offi...,garbage can.,wastebasket.


### OpenAI Query

In [None]:
import openai

# Placeholder key: requests go to the custom api_base below, not to the OpenAI
# service (presumably that endpoint does not validate the key — TODO confirm).
openai.api_key = "EMPTY"
openai.api_base = "http://pitt.lti.cs.cmu.edu:8003/v1"
# ^ This address is specific to the original setup; replace it with the URL of your
#   own OpenAI-compatible server, see https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html

# Default model name sent to the completions endpoint by call_openai_api.
engine = "meta-llama/Llama-2-13b-hf"

def call_openai_api(prompt, engine_name: str = engine, temperature=0.0, n=1, stop='\n', max_tokens: int = 100):
    """
    Samples `n` completions for `prompt` from the configured completions endpoint.

    Requests are issued in batches of at most 32 samples until `n` samples
    have been collected.

    Parameters:
    - prompt: Prompt text sent to the model
    - engine_name: Model name passed to the endpoint
    - temperature: Sampling temperature
    - n: Number of completions to draw
    - stop: Stop sequence forwarded to the endpoint
    - max_tokens: Maximum number of tokens to generate per completion

    Returns:
    - A list of completion texts when `n > 1`, a single completion text
      otherwise, or None if any exception is raised (the exception is printed).
    """
    BATCH_SIZE = 32

    completions = []
    remaining = n

    try:
        while remaining > 0:
            batch_n = min(remaining, BATCH_SIZE)
            response = openai.Completion.create(
                model=engine_name,
                prompt=prompt,
                temperature=temperature,
                max_tokens=max_tokens,
                n=batch_n,
                stop=stop,
            )

            for choice in response['choices']:
                completions.append(choice['text'])
            remaining -= batch_n

        if n > 1:
            return completions
        # Single-sample callers get the bare string rather than a one-element list.
        return completions[0]
    except Exception as e:
        # Best-effort: report the failure and let the caller see None for this prompt.
        print(e)
        return None


In [None]:
from functools import partial

### Run verification

In [None]:
from transformers import AutoTokenizer
# Tokenizer used by make_verifier_input to count and truncate prompt tokens.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-13b-hf")


In [None]:
# Few-shot verification prompt: four solved examples (two judged Correct, two
# judged Incorrect), separated by `---` lines, followed by one unsolved example.
# The {context}, {question} and {generated_answer} placeholders are filled in with
# str.format; the prompt ends at "Evaluation:" so the model writes the evaluation
# and the verification decision itself.
verifier_prompt = """Context: The manuscript, discovered in 1980 in a dusty attic, turned out to be a lost work of Shakespeare.

Question: Whose lost work was discovered in a dusty attic in 1980?

AI Generated Answer: Shakespeare

Instruction: Your task is to evaluate if the AI Generated Answer is correct, based on the provided context and question. Provide the judgement and reasoning for each case. Choose between Correct or Incorrect.

Evaluation: The context specifically mentions that a lost work of Shakespeare was discovered in 1980 in a dusty attic.

Verification Decision: The AI generated answer is Correct.

---

Context: The celestial event, known as the Pink Moon, is unique to the month of April and has cultural significance in many indigenous tribes.

Question: In which month does the celestial event, the Pink Moon, occur?

AI Generated Answer: July

Instruction: Your task is to evaluate if the AI Generated Answer is correct, based on the provided context and question. Provide the judgement and reasoning for each case. Choose between Correct or Incorrect.

Evaluation: The context clearly states that the Pink Moon is unique to the month of April.

Verification Decision: The AI generated answer is Incorrect.

---

Context: The Mona Lisa, housed in the Louvre Museum, is believed to be a portrait of Lisa Gherardini, painted by Leonardo da Vinci in the early 16th century.

Question: Who is believed to have painted the Mona Lisa in the early 16th century?

AI Generated Answer: Vincent van Gogh

Instruction: Your task is to evaluate if the AI Generated Answer is correct, based on the provided context and question. Provide the judgement and reasoning for each case. Choose between Correct or Incorrect.

Evaluation: The context specifies that the Mona Lisa was painted by Leonardo da Vinci in the early 16th century.

Verification Decision: The AI generated answer is Incorrect.

---

Context: The planet Kepler-442b, located 1,100 light-years away, is one of the most Earth-like planets ever discovered, having a similar size and orbiting within its star's habitable zone.

Question: How far away is the planet Kepler-442b?

AI Generated Answer: 1,100 light-years

Instruction: Your task is to evaluate if the AI Generated Answer is correct, based on the provided context and question. Provide the judgement and reasoning for each case. Choose between Correct or Incorrect.

Evaluation: The context states that Kepler-442b is located 1,100 light-years away.

Verification Decision: The AI generated answer is Correct.

---

Context: {context}

Question: {question}

AI Generated Answer: {generated_answer}

Instruction: Your task is to evaluate if the AI Generated Answer is correct, based on the provided context and question. Provide the judgement and reasoning for each case. Choose between Correct or Incorrect.

Evaluation:"""


def make_verifier_input(row):
    """Fill the verifier prompt template from a single row.

    Reads the "base_context", "question" and "generated_answer" fields of `row`;
    the generated answer is stripped of surrounding whitespace before formatting.
    """
    # NOTE(review): a later cell defines make_verifier_input(context, question,
    # generated_answer); once that cell runs it replaces this row-based version.
    answer_text = row["generated_answer"].strip()
    fields = dict(
        context=row["base_context"],
        question=row["question"],
        generated_answer=answer_text,
    )
    return verifier_prompt.format(**fields)



In [None]:
def make_verifier_input(context, question, generated_answer, max_prompt_tokens: int = 3950):
    """
    Builds the few-shot verification prompt for one example, capped at a token budget.

    Parameters:
    - context: Context text inserted into the prompt
    - question: Question inserted into the prompt
    - generated_answer: Answer to be judged, inserted into the prompt
    - max_prompt_tokens: Maximum number of tokenizer tokens kept in the prompt
      (defaults to 3950, the limit that used to be hard-coded here)

    Returns:
    - The formatted prompt. If it is longer than `max_prompt_tokens` tokens,
      only the last `max_prompt_tokens` tokens are kept and joined back into text.

    Raises:
    - ValueError: if `max_prompt_tokens` is not positive
    """
    if max_prompt_tokens <= 0:
        raise ValueError(f"max_prompt_tokens must be positive, got {max_prompt_tokens}")

    prompt_text = verifier_prompt.format(
        context=context, question=question, generated_answer=generated_answer
    )
    tokens = tokenizer.tokenize(prompt_text)

    if len(tokens) <= max_prompt_tokens:
        return prompt_text

    # Truncate from the left: the end of the prompt (question, answer under test and
    # the trailing "Evaluation:") is preserved, while the beginning of the few-shot
    # examples is what gets cut.
    kept_tokens = tokens[-max_prompt_tokens:]
    return tokenizer.convert_tokens_to_string(kept_tokens)


In [None]:
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from tqdm import tqdm

def run_verification(
    df,
    ans_col: str,
    temperature: float = 1.0,
    n: int = 8,
    stop: str = '---',
    max_tokens: int = 250,
    max_workers: int = 32,
):
    """
    Builds one verifier prompt per row of `df` and queries the language model for
    each prompt in parallel threads, showing a tqdm progress bar.

    Parameters:
    - df: Input dataframe; the "base_ctx" and "question" columns are read from each row
    - ans_col: Name of the column in df holding the answer to verify
    - temperature: Sampling temperature forwarded to call_openai_api
    - n: Number of verification samples to draw per row
    - stop: Stop sequence forwarded to call_openai_api
    - max_tokens: Maximum number of tokens to generate per sample
    - max_workers: Number of parallel calls to make to the language model

    Returns:
    - results: List with one entry per row, in row order; each entry is whatever
      call_openai_api returned for that row's prompt
    """
    def build_prompt(row):
        return make_verifier_input(row["base_ctx"], row["question"], row[ans_col])

    prompts = df.apply(build_prompt, axis=1)

    # Freeze the sampling settings so the executor only has to supply the prompt.
    query_verifier = partial(
        call_openai_api,
        temperature=temperature,
        n=n,
        stop=stop,
        max_tokens=max_tokens,
    )

    print("Inputs prepared, starting verification now.")
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        progress = tqdm(pool.map(query_verifier, prompts), total=df.shape[0])
        results = list(progress)
    return results



In [None]:
# Draw 32 verifier samples for each llama2-13b answer, using 32 parallel workers.
ver13b = run_verification(
    df=llama2_outputs,
    ans_col='llama13b_pred_ans',
    n=32,
    max_workers=32,
)

In [None]:

# Number of verification results (run_verification returns one entry per input row).
len(ver13b)

1

In [None]:
# Store the verifier samples on the dataframe they were generated from.
# (`train` is not defined in any earlier cell, so assigning to it raised NameError.)
llama2_outputs['llama13b_ver'] = ver13b

In [None]:
def compute_fraction_correct(lst):
    """
    Computes the fraction of verifier samples that judged the answer Correct.

    A sample counts as valid when it contains "the ai generated answer is"
    (case-insensitive), and as correct when it contains
    "the ai generated answer is correct".

    Parameters:
    - lst: Verifier outputs for one example. Normally a list of strings; a single
      string (what call_openai_api returns for n == 1) is treated as a one-element
      list, and None (what it returns on failure) as no samples. Non-string items
      are ignored.

    Returns:
    - correct / valid as a float, or 0 when there is no valid sample
    """
    if lst is None:
        return 0
    if isinstance(lst, str):
        # Iterating a bare string would walk its characters and never match.
        lst = [lst]

    verdict_marker = "the ai generated answer is"
    correct_marker = "the ai generated answer is correct"

    # Lower-case once and reuse for both counts.
    lowered = [item.lower() for item in lst if isinstance(item, str)]

    total_valid = sum(verdict_marker in text for text in lowered)
    if total_valid == 0:
        return 0
    correct_count = sum(correct_marker in text for text in lowered)
    return correct_count / total_valid
