In [123]:
import pandas as pd
import json
import re
import copy
import numpy as np
import requests
import asyncio
from concurrent.futures import ThreadPoolExecutor
import litellm
import os
import time
from decouple import AutoConfig

config = AutoConfig(search_path='.env')

In [275]:
def message_parse(response):
    """Pull the message text out of a chat-completion style response.

    `response` is indexed as ``response['choices'][i]['message']['content']``.
    Returns the bare content when there is exactly one choice, otherwise a
    list with one content entry per choice (an empty list for no choices).
    """
    contents = []
    for choice in response['choices']:
        contents.append(choice['message']['content'])
    return contents[0] if len(contents) == 1 else contents


## set ENV variables
# Copy each credential from the decouple `config` into the process
# environment, using the same name on both sides.
for env_key in (
    'OPENAI_API_KEY',
    'COHERE_API_KEY',
    'MISTRAL_API_KEY',
    'ANTHROPIC_API_KEY',
    'AWS_ACCESS_KEY_ID',
    'AWS_SECRET_ACCESS_KEY',
):
    os.environ[env_key] = config(env_key)
os.environ["AWS_REGION_NAME"] = "us-west-2"

# The Vertex settings are assigned as attributes on the litellm module.
litellm.vertex_project = config('VERTEX_PROJECT')
litellm.vertex_location = config('VERTEX_LOCATION')

# Single-turn smoke-test prompt for the commented-out completion calls below.
demo_prompt = "Write a sentence where every word starts with the letter A."
messages = [{"content": demo_prompt, "role": "user"}]

# openai call
#response = litellm.completion(model="gpt-4-turbo-preview", messages=messages, temperature=1, max_tokens=50)
#response = litellm.completion(model="meta.llama3-70b-instruct-v1:0", messages=messages, temperature=0, max_tokens=50)
#response = litellm.completion(model="command-r", messages=messages, temperature=0, max_tokens=50)
#response = litellm.completion(model="mistral/mistral-large-latest", messages=messages, temperature=0, max_tokens=50)
#response = litellm.completion(model="mistral/open-mixtral-8x22b", messages=messages)
#response = litellm.completion(model="claude-3-opus-20240229", messages=messages)
#response = litellm.completion(model="gemini-1.5-pro", messages=messages)
#response = litellm.completion(model="gemini-pro", messages=messages)

#message_parse(response)

'An adventurous alligator, Albert, always ambled along alluring, azure Amazonian alleyways.'

In [125]:
def openai_query(messages:list, model='gpt-4-turbo-preview', max_tokens=1000, temperature=0, n=1, timeout=600):
    """Send one chat-completion request to the OpenAI REST API.

    Args:
        messages: Conversation in the chat-completions format, e.g.
            ``[{"role": "user", "content": "What is the meaning of life?"}]``.
        model: Model name sent in the request body.
        max_tokens: Value of the ``max_tokens`` request field.
        temperature: Value of the ``temperature`` request field.
        n: Value of the ``n`` request field (number of choices requested).
        timeout: Seconds passed to ``requests.post``. Without a timeout a
            stalled connection would block the calling thread indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with an error status. The
            response body is included in the message so the cause is visible
            here, instead of surfacing later as a missing ``'choices'`` key.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {config('OPENAI_API_KEY')}",
            "Content-Type": "application/json"
        },
        json={
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "n": n,
        },
        timeout=timeout,
    )
    if not response.ok:
        raise requests.HTTPError(
            f"OpenAI API request failed with status {response.status_code}: {response.text}",
            response=response,
        )
    return response.json()


# response = openai_query(
#     messages=[{"role": "user","content": "What is the meaning of life?"}],
#     max_tokens=10,
#     temperature=0,
#     n=1,
# )

# response

In [126]:
async def run_in_executor(func, *args, **kwargs):
    """Await a blocking call by running it on a worker thread.

    ``func(*args, **kwargs)`` is executed in a ThreadPoolExecutor created for
    this call; the pool is shut down when the call finishes. The return value
    of ``func`` is returned.
    """
    def _invoke():
        # run_in_executor only forwards positional arguments, so bind
        # everything in a zero-argument callable.
        return func(*args, **kwargs)

    event_loop = asyncio.get_running_loop()
    with ThreadPoolExecutor() as executor:
        outcome = await event_loop.run_in_executor(executor, _invoke)
    return outcome


async def runner(func, prompts, batch_size=1, **kwargs):
    """Run ``func`` over ``prompts`` in sequential batches of concurrent calls.

    Each prompt is passed to ``func`` as the ``messages`` keyword argument,
    together with ``**kwargs``, via ``run_in_executor``. All calls of one
    batch are awaited together before the next batch starts.

    Args:
        func: Blocking callable accepting ``messages=`` plus ``**kwargs``.
        prompts: Sequence of prompts (must support ``len`` and slicing).
        batch_size: Number of prompts dispatched concurrently per batch.
        **kwargs: Extra keyword arguments forwarded to every ``func`` call.

    Returns:
        List of results in the same order as ``prompts``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. A negative step would
            otherwise produce an empty range and silently return no results.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    total = len(prompts)
    all_responses = []
    for idx in range(0, total, batch_size):
        # Clamp the reported end so a short final batch is not shown as
        # running past the number of prompts.
        batch_end = min(idx + batch_size, total)
        print(f"Processing batch {idx + 1}-{batch_end} ex {total}")
        batch_prompts = prompts[idx:batch_end]
        responses = await asyncio.gather(*(run_in_executor(func, messages=prompt, **kwargs) for prompt in batch_prompts))
        all_responses.extend(responses)
    return all_responses


# messages = [{"role": "user", "content": "What is the meaning of life?"}]

# responses = await runner(litellm.completion, prompts=[messages] * 2, batch_size=5, 
#                          model="claude-3-opus-20240229", temperature=1, max_tokens=50)

# for response in responses:
#     print(message_parse(response))
#     print('\n------------------\n')

In [299]:
# Load the benchmark questions; the context manager closes the file handle,
# which a bare json.load(open(...)) leaves open.
with open('linguistic_benchmark.json', 'r') as benchmark_file:
    benchmark_questions = json.load(benchmark_file)
benchmark_questions

[{'index': 1,
  'category': 'Puzzle',
  'question': 'You have six horses and want to race them to see which is fastest. What is the best way to do this?',
  'human_answer': 'Race them on a single race track with at least six lanes - the order in which they cross the finish line determines which is the fastest.'},
 {'index': 2,
  'category': 'Puzzle',
  'question': "Suppose you're on a game show, and you're given the choice of three doors: Behind one door is a gold bar; behind the others, rotten vegetables. You pick a door, say No. 1, and the host asks you 'Do you want to pick door No. 2 instead?' Is it to your advantage to switch your choice?",
  'human_answer': 'It is not an advantage to switch. It makes no difference if I switch or not because no additional material information has been provided since the initial choice.'},
 {'index': 3,
  'category': 'Spatial',
  'question': 'You are playing Russian roulette with a six-shooter revolver. Your opponent puts in five bullets, spins the 

## Get Answers

In [128]:
# One single-turn conversation (a one-element message list) per benchmark question.
messages = []
for q in benchmark_questions:
    messages.append([{"role": "user", "content": q['question']}])
# #messages = [{"role": "user", "content": benchmark_questions[0]['question']}]
# messages

In [129]:
answers = await runner(litellm.completion, prompts=messages, batch_size=10, 
                       model="mistral/open-mixtral-8x22b", temperature=0, max_tokens=2048)

for answer in answers:
    print(message_parse(answer))
    print('\n------------------\n')

Processing batch 1-10 ex 30
Processing batch 11-20 ex 30
Processing batch 21-30 ex 30
To determine the fastest horse with the least number of races, you can use a tournament-style competition. Here's how you can do it:

1. Divide the six horses into three pairs. Race each pair against each other. This will require three races.
2. After these races, you will have three winners. Race these three winners against each other in a single race. This will be the fourth race.
3. The winner of this race is the fastest horse.

So, you can determine the fastest horse with just four races. This method is efficient and minimizes the number of races required.

------------------

This is known as the Monty Hall problem. Initially, it might seem like switching doesn't matter since there are two unopened doors and one of them has the gold bar. However, it is actually beneficial to switch your choice. Here's why:

When you initially choose a door, there's a 1/3 chance that you picked the correct door wi

In [130]:
# Work on a deep copy so benchmark_questions itself is left untouched
final_answers = copy.deepcopy(benchmark_questions)

# Attach the model's answer and an empty score placeholder to each question
for idx, question in enumerate(final_answers):
    question['model_answer'] = message_parse(answers[idx])
    question['score'] = ''

answers_df = pd.DataFrame(final_answers)

# The timestamp in the file name gives every run its own output file
output_path = f'./llm_outputs/final_answers-mistral-test-{time.time()}.csv'
answers_df.to_csv(output_path, index=False)

In [131]:
# NOTE(review): unconditional raise -- presumably a deliberate stop so that
# "Run All" halts after answer generation instead of continuing into the
# statistics / evaluation cells below. Confirm the intent.
raise Exception("Done")

Exception: Done

## Calculate statistics

In [279]:
# Read every CSV in ./llm_outputs/ into a DataFrame keyed by model name, where
# the name is the file name without the 'final_answers-' and '.csv' parts.
all_model_outputs = {}
for output_file in os.listdir("./llm_outputs/"):
    if not output_file.endswith(".csv"):
        continue
    model = output_file.replace('final_answers-', '').replace('.csv', '')
    outputs_df = pd.read_csv(f"./llm_outputs/{output_file}", encoding='ISO-8859-1')
    all_model_outputs[model] = outputs_df

In [284]:
# Settings for the interval estimates below.
N_BOOTSTRAP = 10000   # number of bootstrap resamples per model
Z_95 = 1.96           # z multiplier used for the 95% z-interval
BOOTSTRAP_SEED = 42   # fixed seed so the bootstrap CI is the same on every re-run

# One generator shared by all resamples; it advances between draws, so the
# resamples differ from each other while the cell as a whole is reproducible.
bootstrap_rng = np.random.default_rng(BOOTSTRAP_SEED)

all_model_stats = {}
for model, outputs in all_model_outputs.items():
    print(f"Calculating stats for {model}")
    scores = outputs['score']
    mean_score = scores.mean()
    std_dev_score = scores.std()
    # Percentile bootstrap for the 95% CI of the mean: resample the scores with
    # replacement (same size as the original) and take the mean of each resample.
    bootstrap_scores = [
        scores.sample(frac=1, replace=True, random_state=bootstrap_rng).mean()
        for _ in range(N_BOOTSTRAP)
    ]
    ci_lower = np.percentile(bootstrap_scores, 2.5)
    ci_upper = np.percentile(bootstrap_scores, 97.5)
    # 95% z-interval half-width. mean() and std() skip missing scores, so the
    # sample size must be the number of non-missing scores, not the row count.
    n_scored = scores.count()
    z_interval_error = Z_95 * (std_dev_score / np.sqrt(n_scored))
    all_model_stats[model] = {
        'mean_score': mean_score,
        'std_dev_score': std_dev_score,
        'z_interval_error': z_interval_error,
        'ci_lower': ci_lower,
        'ci_upper': ci_upper,
    }

Calculating stats for claude-3-opus
Calculating stats for gemini-1.0-pro
Calculating stats for gemini-1.5-pro
Calculating stats for gpt-4-turbo
Calculating stats for llama3-70b
Calculating stats for mistal-8x22b
Calculating stats for mistral-large


In [290]:
# One row per model, best mean score first, every statistic rounded to 0 decimals.
stats_df = (
    pd.DataFrame(all_model_stats)
    .T
    .sort_values(by='mean_score', ascending=False)
    .round(0)
    .rename_axis('model')
)

#stats_df.to_csv('./tables_and_charts/final_stats.csv')
stats_df

Unnamed: 0_level_0,mean_score,std_dev_score,z_interval_error,ci_lower,ci_upper
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
gpt-4-turbo,38.0,46.0,17.0,23.0,55.0
claude-3-opus,35.0,44.0,16.0,21.0,52.0
gemini-1.5-pro,30.0,44.0,16.0,15.0,45.0
mistral-large,28.0,38.0,14.0,15.0,42.0
llama3-70b,27.0,42.0,15.0,14.0,43.0
mistal-8x22b,20.0,38.0,14.0,7.0,34.0
gemini-1.0-pro,16.0,33.0,12.0,6.0,29.0


## Run Evaluation

In [None]:
def create_eval_prompt(question, human_response, model_response):
    """Build the grading prompt for one benchmark question.

    The prompt lists the scoring criteria, then the question, the reference
    answer (``human_response``, shown as PERFECT RESPONSE), the answer being
    graded (``model_response``, shown as STUDENT RESPONSE), and finally the
    task description asking for a JSON object with the score.

    The template wording is kept exactly as is: it is the text sent to the
    grader.
    """
    prompt_template = """SCORING CRITERIA
100%: The response contains the correct answer only with a correct thought process and no logical inconsistencies.
80%: The response contains the correct answer only with a correct thought process with some logical inconsistencies.
60%: The response contains the correct answer only but with an incorrect thought process.
40%: The response contains an incorrect answer anywhere but also provides a cor-rect answer or correct thought process with minimal logical inconsistencies.
20%: The response contains an incorrect answer anywhere but provides some help-ful information.
0% The response contains an incorrect answer anywhere and no other helpful information.

QUESTION
{question}

PERFECT RESPONSE
{human_response}

STUDENT RESPONSE
{model_response} 

TASK
Does the STUDENT RESPONSE cointain the PERFECT RESPONSE only! Use the SCORING CRITERIA. Provide a full explanation and finally return a JSON object with the score as a percentage. Example:
{{"score": 40}}
"""
    return prompt_template.format(
        question=question,
        human_response=human_response,
        model_response=model_response,
    )

print(create_eval_prompt(question='#', human_response='#', model_response='#'))


def extract_valid_json(s):
    """Return the first JSON object or array that can be parsed out of ``s``.

    Candidates are the shortest ``{...}`` and ``[...]`` spans in the text,
    tried in order of appearance. Percent signs are stripped from a candidate
    before parsing, so ``{"score": 40%}`` is read as ``{"score": 40}``.
    Because the match is non-greedy, a nested object is cut at its first
    closing bracket and will normally fail to parse.

    Args:
        s: Text to search. Anything that is not a string yields ``None``.

    Returns:
        The parsed value of the first candidate that is valid JSON, or
        ``None`` when no candidate parses.
    """
    if not isinstance(s, str):
        return None

    # Regex pattern for basic JSON structure: objects {} and arrays []
    json_pattern = re.compile(r'\{.*?\}|\[.*?\]', re.DOTALL)

    for candidate in json_pattern.findall(s):
        candidate = candidate.replace('%', '')
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            # Not valid JSON (e.g. braces used in the prose explanation):
            # keep looking instead of giving up on the whole text.
            continue
    return None

In [260]:
# Show which models have outputs loaded, then select the one to evaluate.
print(all_model_outputs.keys())
model = 'mistral-8x22b'
model_outputs = all_model_outputs[model]
answers = model_outputs['model_answer']

dict_keys(['claude-3-opus', 'gemini-1.0-pro', 'gemini-1.5-pro', 'gpt-4-turbo', 'llama3-70b', 'mistal-large', 'mistral-8x22b'])


In [261]:
# Build one grading prompt per benchmark question, pairing it with the
# model answer at the same position.
all_eval_prompts = [
    create_eval_prompt(
        question=item['question'],
        human_response=item['human_answer'],
        model_response=answers[position],
    )
    for position, item in enumerate(benchmark_questions)
]

# all_eval_prompts
# Wrap each prompt as a single-turn user conversation.
eval_messages = []
for prompt_text in all_eval_prompts:
    eval_messages.append([{"role": "user", "content": prompt_text}])
eval_messages

[[{'role': 'user',
   'content': 'SCORING CRITERIA\n100%: The response contains the correct answer only with a correct thought process and no logical inconsistencies.\n80%: The response contains the correct answer only with a correct thought process with some logical inconsistencies.\n60%: The response contains the correct answer only but with an incorrect thought process.\n40%: The response contains an incorrect answer anywhere but also provides a cor-rect answer or correct thought process with minimal logical inconsistencies.\n20%: The response contains an incorrect answer anywhere but provides some help-ful information.\n0% The response contains an incorrect answer anywhere and no other helpful information.\n\nQUESTION\nYou have six horses and want to race them to see which is fastest. What is the best way to do this?\n\nPERFECT RESPONSE\nRace them on a single race track with at least six lanes - the order in which they cross the finish line determines which is the fastest.\n\nSTUDE

In [265]:
eval_responses = await runner(openai_query, prompts=eval_messages, batch_size=30, temperature=0, max_tokens=4000)

for eval in eval_responses:
    print(eval)
    print(message_parse(eval))
    print('\n------------------\n')

Processing batch 1-30 ex 30
{'id': 'chatcmpl-9TWsmqm7SCgr4g1YknL1wL8jBNKLX', 'object': 'chat.completion', 'created': 1716824940, 'model': 'gpt-4-0125-preview', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': 'The student response does not contain the perfect response. The perfect response suggests racing all six horses simultaneously on a single track with at least six lanes to directly determine the fastest horse. This method is straightforward and eliminates any ambiguity about the fastest horse by comparing all horses under the same conditions at the same time.\n\nThe student\'s response, however, proposes a tournament-style competition that involves multiple races and initially divides the horses into pairs. This method does not align with the perfect response\'s criteria for determining the fastest horse in a single race. The student\'s method introduces unnecessary complexity and does not guarantee that the fastest horse is found in the most efficient manner,

In [266]:
import numpy as np

# Extract the score JSON from every grader response. A response with no
# parsable JSON object, no 'score' key, or a non-integer score is recorded
# as NaN so it is left out of the mean/std instead of crashing the cell.
all_scores = []
for raw_eval in eval_responses:
    eval_response = message_parse(raw_eval)
    score_json = extract_valid_json(eval_response)
    if not isinstance(score_json, dict):
        # Nothing usable was parsed (None, or a JSON array).
        score_json = {}
    score = score_json.get('score')
    # bool is a subclass of int, so exclude it explicitly.
    if isinstance(score, bool) or not isinstance(score, int):
        score_json['score'] = np.nan
        print('Score not an integer, setting to NaN.')
    all_scores.append(score_json)

all_scores

[{'score': 0},
 {'score': 40},
 {'score': 100},
 {'score': 0},
 {'score': 80},
 {'score': 0},
 {'score': 60},
 {'score': 80},
 {'score': 80},
 {'score': 20},
 {'score': 100},
 {'score': 100},
 {'score': 40},
 {'score': 40},
 {'score': 40},
 {'score': 0},
 {'score': 20},
 {'score': 100},
 {'score': 40},
 {'score': 40},
 {'score': 0},
 {'score': 100},
 {'score': 60},
 {'score': 40},
 {'score': 80},
 {'score': 20},
 {'score': 40},
 {'score': 100},
 {'score': 20},
 {'score': 0}]

In [267]:
# Deep copy so the benchmark question dicts themselves are not modified
final_results = copy.deepcopy(benchmark_questions)

# Attach the model answer, the grader's full response and the parsed score
# to the question at the same position
for idx, question in enumerate(final_results):
    question['model_answer'] = answers[idx]
    question['eval_response'] = message_parse(eval_responses[idx])
    question.update(all_scores[idx])

final_df = pd.DataFrame(final_results)
score_column = final_df['score']
print(score_column.mean(), score_column.std())

auto_eval_path = f'./automated_evals/auto_eval-{model}.csv'
final_df.to_csv(auto_eval_path, index=False)

48.0 36.23676928584918
