In [None]:
import pandas as pd
import json
import re
import copy
import numpy as np
import requests
import asyncio
from concurrent.futures import ThreadPoolExecutor
import litellm
import os
from decouple import AutoConfig

# Settings reader from python-decouple; later cells call config('NAME') to fetch API keys and project settings.
# NOTE(review): python-decouple describes search_path as the directory to start searching from,
# not a file name — confirm that passing '.env' resolves the intended settings file.
config = AutoConfig(search_path='.env')

## LLM service

In [None]:
## Copy provider credentials from the decouple config into os.environ
for _env_key in (
    "OPENAI_API_KEY",
    "COHERE_API_KEY",
    "MISTRAL_API_KEY",
    "ANTHROPIC_API_KEY",
    "AWS_ACCESS_KEY_ID",
    "AWS_SECRET_ACCESS_KEY",
):
    os.environ[_env_key] = config(_env_key)

# The AWS region is fixed here rather than read from the config.
os.environ["AWS_REGION_NAME"] = "us-west-2"

# Vertex settings are assigned as attributes on the litellm module instead of environment variables.
litellm.vertex_project = config('VERTEX_PROJECT')
litellm.vertex_location = config('VERTEX_LOCATION')


In [None]:
## Test LLM Service 

#messages = [{ "content": "Write a sentence where every word starts with the letter A.","role": "user"}]

#response = litellm.completion(model="gpt-4-turbo-preview", messages=messages, temperature=1, max_tokens=50)
#response = litellm.completion(model="meta.llama3-70b-instruct-v1:0", messages=messages, temperature=0, max_tokens=50)
#response = litellm.completion(model="command-r", messages=messages, temperature=0, max_tokens=50)
#response = litellm.completion(model="mistral/mistral-large-latest", messages=messages, temperature=0, max_tokens=50)
#response = litellm.completion(model="mistral/open-mixtral-8x22b", messages=messages)
#response = litellm.completion(model="claude-3-opus-20240229", messages=messages)
#response = litellm.completion(model="gemini-1.5-pro", messages=messages)
#response = litellm.completion(model="gemini-pro", messages=messages)

#message_parse(response)

In [None]:
def message_parse(response):
    """Pull the message text out of a chat-completion style response.

    Reads ``response['choices']`` and collects ``choice['message']['content']``
    for every choice.

    Returns:
        The single content value when there is exactly one choice; otherwise
        the list of all content values (an empty list when there are no choices).
    """
    contents = []
    for choice in response['choices']:
        contents.append(choice['message']['content'])
    # A lone choice is unwrapped so callers get the text directly.
    return contents[0] if len(contents) == 1 else contents



def openai_query(messages:list, model='gpt-4-turbo-preview', max_tokens=1000, temperature=0, n=1, timeout=(10, 600)):
    """
    Send one chat-completion request to the OpenAI REST API and return the decoded JSON body.

    Args:
        messages: Chat messages to send, e.g.
            [{"role": "user", "content": "What is the meaning of life?"}]
        model: Value sent as the "model" field of the request.
        max_tokens: Value sent as the "max_tokens" field.
        temperature: Value sent as the "temperature" field.
        n: Value sent as the "n" field.
        timeout: Forwarded to ``requests.post``. The default is a
            (connect, read) pair in seconds so a stalled connection cannot
            block the calling worker thread forever. Pass ``None`` to wait
            indefinitely, which was the behaviour before this parameter existed.

    Returns:
        Whatever ``response.json()`` decodes from the reply. The HTTP status is
        not checked here, so an error payload is returned to the caller as-is.

    Raises:
        requests.exceptions.Timeout: If the connect or read timeout elapses.
    """
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers={
            # The key is read from the decouple config on every call.
            "Authorization": f"Bearer {config('OPENAI_API_KEY')}",
            "Content-Type": "application/json"
        },
        json={
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "n": n,
        },
        timeout=timeout,
    )
    return response.json()


# response = openai_query(
#     messages=[{"role": "user","content": "What is the meaning of life?"}],
#     max_tokens=10,
#     temperature=0,
#     n=1,
# )

# response

In [None]:
async def run_in_executor(func, *args, **kwargs):
    """Await a blocking call by running it on a worker thread.

    A fresh ThreadPoolExecutor is created for this one call and shut down when
    the call has finished.

    Args:
        func: Callable to invoke.
        *args, **kwargs: Forwarded to ``func`` unchanged.

    Returns:
        Whatever ``func(*args, **kwargs)`` returns.
    """
    def _invoke():
        # Bind the arguments so the executor receives a zero-argument callable.
        return func(*args, **kwargs)

    with ThreadPoolExecutor() as executor:
        outcome = await asyncio.get_running_loop().run_in_executor(executor, _invoke)
    return outcome


async def runner(func, prompts, batch_size=1, **kwargs):
    """Call ``func`` once per prompt, running each batch of prompts concurrently.

    Prompts are processed in consecutive slices of ``batch_size``. Every prompt
    in a slice is dispatched through ``run_in_executor`` and awaited together
    with ``asyncio.gather`` before the next slice starts.

    Args:
        func: Callable invoked as ``func(messages=prompt, **kwargs)``.
        prompts: Sequence of prompts; each one is passed as ``messages``.
        batch_size: Number of prompts dispatched concurrently (must be >= 1).
        **kwargs: Extra keyword arguments forwarded to every ``func`` call.

    Returns:
        A list with one response per prompt, in the same order as ``prompts``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Previously a negative
            value produced an empty range and silently returned ``[]``.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    total = len(prompts)
    all_responses = []
    for start in range(0, total, batch_size):
        batch_prompts = prompts[start: start + batch_size]
        # Report the real end of the batch; the last slice may be shorter than batch_size.
        print(f"Processing batch {start + 1}-{start + len(batch_prompts)} ex {total}")
        responses = await asyncio.gather(
            *(run_in_executor(func, messages=prompt, **kwargs) for prompt in batch_prompts)
        )
        all_responses.extend(responses)
    return all_responses


# messages = [{"role": "user", "content": "What is the meaning of life?"}]

# responses = await runner(litellm.completion, prompts=[messages] * 2, batch_size=5, 
#                          model="claude-3-opus-20240229", temperature=1, max_tokens=50)

# for response in responses:
#     print(message_parse(response))
#     print('\n------------------\n')

In [None]:
# Load the benchmark questions. The context manager closes the file handle right away
# (the bare open() call left it open), and the explicit encoding avoids depending on
# the platform's locale default when decoding the JSON text.
with open('linguistic_benchmark.json', 'r', encoding='utf-8') as benchmark_file:
    benchmark_questions = json.load(benchmark_file)
benchmark_questions

## Get Answers

In [None]:
# Build one single-turn conversation (a list holding one user message) per benchmark question.
messages = []
for benchmark_item in benchmark_questions:
    user_turn = {"role": "user", "content": benchmark_item['question']}
    messages.append([user_turn])

In [None]:
answers = await runner(litellm.completion, prompts=messages, batch_size=10, 
                       model="mistral/open-mixtral-8x22b", temperature=0, max_tokens=2048)

for answer in answers:
    print(message_parse(answer))
    print('\n------------------\n')

In [None]:
# Work on a deep copy so the loaded benchmark questions are left untouched.
final_answers = copy.deepcopy(benchmark_questions)

# Attach the parsed model answer and an empty score placeholder to every question.
for position, record in enumerate(final_answers):
    record['model_answer'] = message_parse(answers[position])
    record['score'] = ''

answers_df = pd.DataFrame(final_answers)

# Persist the records keyed by their 'index' column.
indexed_answers = answers_df.set_index('index')
indexed_answers.to_json('./llm_outputs/final_answers-mistral.json', orient='index')

In [None]:
# NOTE(review): presumably a deliberate stop so a top-to-bottom run ends once the answers
# are saved and does not continue into the cells below — confirm this is intended.
raise Exception("Done")

## Calculate statistics

In [None]:
# Map each model name to the DataFrame read from its JSON file in ./llm_outputs/.
all_model_outputs = {}
json_files = [name for name in os.listdir("./llm_outputs/") if name.endswith(".json")]
for output_file in json_files:
    frame = pd.read_json(f"./llm_outputs/{output_file}", orient='index')
    # The model name is the file name with 'final_answers-' and '.json' removed.
    model = output_file.replace('final_answers-', '').replace('.json', '')
    all_model_outputs[model] = frame

In [None]:
# Analysis parameters.
N_BOOTSTRAP = 10_000  # bootstrap resamples drawn per model
BOOTSTRAP_SEED = 0    # fixed seed so the bootstrap confidence intervals are reproducible
Z_95 = 1.96           # z value for a two-sided 95% interval

all_model_stats = {}
for model, outputs in all_model_outputs.items():
    print(f"Calculating stats for {model}")

    # Coerce to numbers and drop unscored rows. Freshly saved answer files carry ''
    # as a score placeholder, which would otherwise make mean()/std() fail.
    scores = pd.to_numeric(outputs['score'], errors='coerce').dropna()
    n_scored = len(scores)
    if n_scored == 0:
        print(f"No numeric scores for {model}, skipping.")
        continue

    mean_score = scores.mean()
    std_dev_score = scores.std()

    # Bootstrap the mean to get a 95% percentile interval. The generator is re-seeded
    # per model so one model's interval does not depend on which other files are present.
    rng = np.random.default_rng(BOOTSTRAP_SEED)
    score_values = scores.to_numpy(dtype=float)
    bootstrap_means = [
        rng.choice(score_values, size=n_scored, replace=True).mean()
        for _ in range(N_BOOTSTRAP)
    ]
    ci_lower, ci_upper = np.percentile(bootstrap_means, [2.5, 97.5])

    # Calculate the 95% z-interval half-width. Use the number of scored rows, i.e. the
    # same sample the standard deviation was computed from, not the raw row count.
    z_interval_error = Z_95 * (std_dev_score / np.sqrt(n_scored))

    all_model_stats[model] = {
        'mean_score': mean_score,
        'std_dev_score': std_dev_score,
        'z_interval_error': z_interval_error,
        'ci_lower': ci_lower,
        'ci_upper': ci_upper,
    }

In [None]:
# One row per model, highest mean score first, every statistic rounded to whole numbers.
stats_by_model = pd.DataFrame(all_model_stats).transpose()
stats_ranked = stats_by_model.sort_values('mean_score', ascending=False)
stats_df = stats_ranked.round(0)
stats_df.index.name = 'model'

#stats_df.to_csv('./tables_and_charts/final_stats.csv')
stats_df

## Run Evaluation

In [None]:
def create_eval_prompt(question, human_response, model_response):
    """Assemble the grading prompt sent to the evaluator model.

    The prompt lists the scoring criteria, then the question, the reference
    ("perfect") response and the student (model) response, and ends with the
    task instruction plus an example JSON score object.

    The wording is reproduced exactly as it was, including its spelling,
    because the text is part of what the evaluator model sees.

    Args:
        question: Text placed under the QUESTION heading.
        human_response: Text placed under the PERFECT RESPONSE heading.
        model_response: Text placed under the STUDENT RESPONSE heading.

    Returns:
        The full prompt as a single newline-terminated string.
    """
    prompt_lines = (
        "SCORING CRITERIA",
        "100%: The response contains the correct answer only with a correct thought process and no logical inconsistencies.",
        "80%: The response contains the correct answer only with a correct thought process with some logical inconsistencies.",
        "60%: The response contains the correct answer only but with an incorrect thought process.",
        "40%: The response contains an incorrect answer anywhere but also provides a cor-rect answer or correct thought process with minimal logical inconsistencies.",
        "20%: The response contains an incorrect answer anywhere but provides enough helpful information to plausibly reach a correct answer.",
        "0% The response contains an incorrect answer, too much unhelpful information, or not enough helpful information to plausibly reach a correct answer",
        "",
        "QUESTION",
        f"{question}",
        "",
        "PERFECT RESPONSE",
        f"{human_response}",
        "",
        "STUDENT RESPONSE",
        # The student response line keeps its single trailing space.
        f"{model_response} ",
        "",
        "TASK",
        "Does the STUDENT RESPONSE cointain the PERFECT RESPONSE only! Use the SCORING CRITERIA. Provide a full explanation and finally return a JSON object with the score as a percentage. Example:",
        '{"score": 40}',
    )
    return "\n".join(prompt_lines) + "\n"

# Render the template with placeholder values to eyeball the layout.
print(create_eval_prompt(question='#', human_response='#', model_response='#'))


def extract_valid_json(s):
    """Return the first JSON value embedded in ``s`` that parses, or ``None``.

    Candidates are the shortest ``{...}`` or ``[...]`` spans found left to
    right. Matching is non-greedy, so a nested object or array is not captured
    as a whole. Every ``%`` is stripped from a candidate before parsing, so
    ``{"score": 40%}`` is read as ``{"score": 40}``.

    Args:
        s: Text to search, e.g. an evaluator model's full reply.

    Returns:
        The decoded value of the first candidate that is valid JSON, or
        ``None`` when no candidate parses.
    """
    # Regex pattern for basic JSON structure: objects {} and arrays []
    json_pattern = re.compile(r'\{.*?\}|\[.*?\]', re.DOTALL)

    for candidate in json_pattern.finditer(s):
        cleaned = candidate.group(0).replace('%', '')
        try:
            # Return the first candidate that parses.
            return json.loads(cleaned)
        except json.JSONDecodeError:
            # Not valid JSON: keep looking. Returning None here would abort the search
            # on the first brace-delimited span that is not JSON.
            continue
    return None

In [None]:
# Show which models have saved outputs available for evaluation.
print(all_model_outputs.keys())

# Choose the model to grade and take its saved answers column.
# NOTE(review): the answer-saving cell writes 'final_answers-mistral.json', which loads under
# the key 'mistral' — confirm that an output file for 'mistral-8x22b' exists.
model = 'mistral-8x22b'
selected_outputs = all_model_outputs[model]
answers = selected_outputs['model_answer']

In [None]:
# Build one grading prompt per benchmark question from its reference answer and the model's answer.
# NOTE(review): answers[idx] uses the enumerate position; when `answers` is a pandas Series this
# is a label lookup — confirm its index runs 0..n-1 in benchmark order.
all_eval_prompts = [
    create_eval_prompt(
        question=question['question'],
        human_response=question['human_answer'],
        model_response=answers[idx],
    )
    for idx, question in enumerate(benchmark_questions)
]

# all_eval_prompts
# Wrap every prompt as a single-turn chat conversation.
eval_messages = []
for prompt_text in all_eval_prompts:
    eval_messages.append([{"role": "user", "content": prompt_text}])
eval_messages

In [None]:
eval_responses = await runner(openai_query, prompts=eval_messages, batch_size=30, temperature=0, max_tokens=4000)

for eval in eval_responses:
    print(eval)
    print(message_parse(eval))
    print('\n------------------\n')

In [None]:
# Parse a score out of every evaluator reply (numpy is already imported as np at the top of the notebook).
all_scores = []
for raw_eval in eval_responses:
    eval_text = message_parse(raw_eval)
    score_json = extract_valid_json(eval_text)
    # extract_valid_json returns None when nothing parses, and may return a list;
    # treat both as "no score" instead of failing on the lookup below.
    if not isinstance(score_json, dict):
        score_json = {}
    # A missing or non-integer score is recorded as NaN so mean()/std() skip it.
    if not isinstance(score_json.get('score'), int):
        score_json['score'] = np.nan
        print('Score not an integer, setting to NaN.')
    all_scores.append(score_json)

all_scores

In [None]:
# Start from a deep copy so the loaded benchmark questions are left untouched.
final_results = copy.deepcopy(benchmark_questions)

# Attach the model answer, the evaluator's full reply and the parsed score fields to each question.
for position, record in enumerate(final_results):
    record['model_answer'] = answers[position]
    record['eval_response'] = message_parse(eval_responses[position])
    record.update(all_scores[position])

final_df = pd.DataFrame(final_results)
score_column = final_df['score']
print(score_column.mean(), score_column.std())

# Persist the graded records keyed by their 'index' column, one file per evaluated model.
indexed_results = final_df.set_index('index')
indexed_results.to_json(f'./automated_evals/auto_eval-{model}.json', orient='index')