# Evaluating All Model Responses with a Judge Model (Groq or OpenAI)

In [7]:
import json
import yaml
from openai import OpenAI
from groq import Groq
import os

In [8]:
def load_yaml(filename):
    """Parse the YAML file at ``filename`` and return its contents.

    The file is decoded as UTF-8 explicitly instead of with the platform's
    default encoding, so non-ASCII text is read the same way on every OS.

    Args:
        filename: Path of the YAML file to read.

    Returns:
        The Python object produced by ``yaml.safe_load`` for the document.
    """
    with open(filename, encoding='utf-8') as f:
        return yaml.safe_load(f)

# Prep Folder for Saves

In [None]:
# Create the root folder for evaluation results. exist_ok=True makes this
# idempotent without a separate existence check (which is race-prone), and it
# fails loudly if the path exists but is not a directory.
os.makedirs('Evaluation/Eval_Results', exist_ok=True)

## Choose Judge Model

In [1]:
# Both values must be filled in before running the cells below; api_request
# raises ValueError when eval_method is neither 'OpenAI' nor 'Groq'.
judge_model = '' # Model name passed to the chat-completions call; one of ['llama3-70b-8192', 'mixtral-8x7b-32768', 'gpt-4-turbo'] in our case
eval_method = '' # API client used by api_request; one of ['Groq', 'OpenAI'] in our case

In [None]:
def api_request(query):
    """Send ``query`` to the judge model as a single user message.

    The client is chosen by the module-level ``eval_method`` and the model by
    the module-level ``judge_model``. API keys are read from the
    ``OPENAI_API_KEY`` / ``GROQ_API_KEY`` environment variables.

    Args:
        query: Full prompt text to send as the user message.

    Returns:
        The text content of the first completion choice.

    Raises:
        ValueError: If ``eval_method`` is neither ``'OpenAI'`` nor ``'Groq'``.
    """
    # Reject unsupported backends up front so the request below is shared.
    if eval_method not in ('OpenAI', 'Groq'):
        raise ValueError(f"Invalid eval_method: {eval_method}")

    if eval_method == 'OpenAI':
        client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    else:
        client = Groq(api_key=os.getenv('GROQ_API_KEY'))

    # Both clients are called through the same chat-completions interface.
    completion = client.chat.completions.create(
        messages=[{"role": "user", "content": query}],
        model=judge_model,
    )
    return completion.choices[0].message.content

## Choose Model to Eval and Load Its Preds

In [2]:
# Both values are interpolated into the predictions file path and into the
# name of the per-run results directory, so they must be set before loading.
eval_model = '' # One of ['llama3-8b-8192', 'llama3-70b-8192', 'mixtral-8x7b-32768', 'gemma-7b-it', 'gemini-1.5-flash', 'gpt-3.5-turbo', 'gpt-4-turbo'] in our case
mode = '' # One of ['normal', 'cot'] in our case

In [3]:
def read_jsonl_file(path):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line is parsed as one JSON document. Blank or
    whitespace-only lines (such as a trailing empty line) are skipped instead
    of raising ``json.JSONDecodeError``. The file is decoded as UTF-8.

    Args:
        path: Path of the JSONL file to read.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

In [None]:
# Path of the saved predictions for the chosen model and mode.
# NOTE(review): the file name ends in `.yaml` but the file is parsed line by
# line as JSON (JSONL) below — confirm this matches how the inference step
# writes it.
model_response_path = f'LLM_Inference/Saves/ls_preds_{eval_model}_{mode}.yaml'

# One parsed JSON object per line of the predictions file.
model_responses = read_jsonl_file(model_response_path)

# Evaluate with Judge

In [5]:
# Judge prompt template for answer correctness. eval_answer fills the
# <question>, <true_answer> and <answer_given_by_contestant> placeholders via
# str.replace; the judge is asked to reply with 'correct' or 'incorrect'.
prompt_for_variable_predictions = """You are the host of a quiz show where you ask complex and tricky questions to the contestants. 
Now once you ask one such question the contestant gives an answer which might not be the exact answer but is still correct for instance the answer provided to you might be 'U.S.A' while the actual answer is 'United States of America' or 'United States' or 'America' etc.
Use your wise judgement to decide based on the question given whether the answer is correct or not. YOU ARE THE JUDGE AND YOUR WORD IS FINAL. Be fair and just in your judgement. Always respond with 'correct' or 'incorrect' based on the answer provided by the contestant. 
You will be provided the question, true answer and the answer provided by the contestant and you need to decide whether the answer is correct or not.

## Question
<question>

## True Answer
<true_answer>

## Answer Given by contestant
<answer_given_by_contestant>

## Your Judgement (correct/incorrect)
"""

In [6]:
# Judge prompt template for rationale quality. eval_rationale fills the
# <question>, <true_answer> (here: the reference rationale) and
# <rationale_given_by_contestant> placeholders via str.replace; the judge is
# asked to reply with a score from 1 to 5.
prompt_for_rationale_predictions = """You are the host of a quiz show where you ask complex and tricky questions to the contestants. 
Now once you ask one such question the contestant gives an answer as well as the rationale behind that answer. You need to decide whether the rationale provided is correct or not by comparing it with the true rationale.
Use your wise judgement to decide based on the question given whether the rationale is correct or not. YOU ARE THE JUDGE AND YOUR WORD IS FINAL. Be fair and just in your judgement. Provide a score between 1 to 5 based on the rationale provided by the contestant.
1 being the least and 5 being the highest score.

## Question
<question>

## True Rationale
<true_answer>

## Rationale Given by contestant
<rationale_given_by_contestant>

## Your Judgement (Score between 1 to 5 do not provide any other score or text)
"""

In [None]:
def eval_answer(question, model_answer, actual_answer):
    """Ask the judge model whether ``model_answer`` matches ``actual_answer``.

    Fills the placeholders of ``prompt_for_variable_predictions`` and sends the
    resulting prompt through ``api_request``.

    Args:
        question: The question that was posed.
        model_answer: Answer produced by the evaluated model.
        actual_answer: Reference answer for the question.

    Returns:
        The judge model's raw text reply.
    """
    # Values loaded from YAML/JSON may be ints, floats, bools or None;
    # str.replace only accepts strings, so coerce before substituting.
    query = (
        prompt_for_variable_predictions
        .replace('<question>', str(question))
        .replace('<true_answer>', str(actual_answer))
        .replace('<answer_given_by_contestant>', str(model_answer))
    )
    return api_request(query)

def eval_rationale(question, model_rationale, actual_rationale):
    """Ask the judge model to score ``model_rationale`` against the reference.

    Fills the placeholders of ``prompt_for_rationale_predictions`` and sends
    the resulting prompt through ``api_request``.

    Args:
        question: The question that was posed.
        model_rationale: Rationale produced by the evaluated model.
        actual_rationale: Reference rationale for the question.

    Returns:
        The judge model's raw text reply.
    """
    # Values loaded from YAML/JSON may not be strings (e.g. None or numbers);
    # str.replace only accepts strings, so coerce before substituting.
    query = (
        prompt_for_rationale_predictions
        .replace('<question>', str(question))
        .replace('<true_answer>', str(actual_rationale))
        .replace('<rationale_given_by_contestant>', str(model_rationale))
    )
    return api_request(query)

In [None]:
# Create the results folder for this (evaluated model, judge model, mode)
# combination. exist_ok=True makes re-runs a no-op without a separate,
# race-prone existence check.
os.makedirs(
    f'Evaluation/Eval_Results/Eval_Model_{eval_model}_Judge_Model_{judge_model}_Mode_{mode}',
    exist_ok=True,
)

In [None]:
# Judge every variable of every saved model response and write one JSON result
# file per (data point, variable) pair into the per-run results directory.
for response in model_responses:
    # Only the first key of each record is used; it names the data point file.
    file_name = list(response.keys())[0]
    # Rebind `response` to the prediction payload stored under that key.
    response = response[file_name]
    # Per-variable predictions, each indexed by variable below.
    ls_var_ans_preds = response['ls_var_ans_preds']
    ls_var_rationale = response['ls_var_rationale']
    # NOTE(review): presumably the model's rationale when shown the gold
    # answer — confirm against the inference code.
    ls_var_gold_rationales = response['ls_var_gold_rationales']

    # Load the reference data point that the predictions refer to.
    data_point_file_path = f'Data_Store/{file_name}.yml'
    data_point = load_yaml(data_point_file_path)

    variables = data_point['variables']
    variable_to_answer = data_point['variable_to_answer']
    variable_specific_rationale = data_point['variable_specific_rationale']
    question = data_point['question']

    for var in variables:
        # Reference answer and rationale for this variable.
        ans_for_var = variable_to_answer[var]
        rationale_for_var = variable_specific_rationale[var]
        # Model outputs for this variable.
        model_ans = ls_var_ans_preds[var]
        model_rationale = ls_var_rationale[var]
        model_gold_rationale = ls_var_gold_rationales[var]

        # Three judge requests per variable: one for the answer and one for
        # each of the two model rationales (both scored against the same
        # reference rationale).
        ans_resp = eval_answer(question, model_ans, ans_for_var)
        rationale_resp = eval_rationale(question, model_rationale, rationale_for_var)
        gold_rationale_resp = eval_rationale(question, model_gold_rationale, rationale_for_var)

        # Keep inputs and raw judge replies together so results can be audited.
        save_dict = {
            'Model Answer': model_ans,
            'Model Rationale': model_rationale,
            'Model Gold Rationale': model_gold_rationale,
            'Actual Answer': ans_for_var,
            'Actual Rationale': rationale_for_var,
            'Answer Response': ans_resp,
            'Rationale Response': rationale_resp,
            'Gold Rationale Response': gold_rationale_resp
        }

        # An existing file with the same name is overwritten ('w' mode).
        with open(f'Evaluation/Eval_Results/Eval_Model_{eval_model}_Judge_Model_{judge_model}_Mode_{mode}/{file_name}_{var}.json', 'w') as f:
            json.dump(save_dict, f, indent=4)