# Setup

In [26]:
import importlib
import json
import math
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path
from typing import Dict, Tuple, List

import numpy as np
import pandas as pd
from pandas import DataFrame
from tqdm import tqdm

In [27]:
# Reload each project module BEFORE binding names from it. A name obtained via
# `from module import name` keeps pointing at the old object after
# importlib.reload(), so the from-import has to come after the reload for
# on-disk edits to take effect without restarting the kernel.
import src.models
importlib.reload(src.models)
from src.models import create_model

import src.evaluate
importlib.reload(src.evaluate)
from src.evaluate import evaluate

import src.conversation
importlib.reload(src.conversation)
from src.conversation import (
    setup_conversation,
    append_messages,
    remove_system_messages
)

## Functions

In [28]:
def process_results(results: List[Dict]) -> DataFrame:
    """Aggregate per-sample evaluation records into one summary row per scenario.

    Args:
        results: Flat list of record dicts. Each record must provide the keys
            'scenario_id', 'decision', 'model_id', 'eval_technique',
            'eval_top_p' and 'eval_temperature'.

    Returns:
        A DataFrame with the columns, in order:
        'scenario_id', 'decision' (the most frequent decision for the
        scenario), 'other_action', 'action_likelihood' (share of the
        scenario's records that chose 'decision'), 'action_entropy' (binary
        entropy of that share, in nats), 'model_id', 'eval_technique',
        'eval_top_p', 'eval_temperature'.
        An empty `results` list yields an empty DataFrame with these columns.

    Raises:
        KeyError: if `results` is non-empty but lacks a required key.

    Notes:
        * Ties between decisions are resolved by `idxmax`, i.e. the first row
          of the (scenario_id, decision)-sorted count table wins.
        * A scenario that appears with more than one distinct combination of
          the metadata columns produces one output row per combination.
    """
    output_columns = [
        'scenario_id', 'decision', 'other_action', 'action_likelihood',
        'action_entropy', 'model_id', 'eval_technique', 'eval_top_p',
        'eval_temperature',
    ]

    raw_df = pd.DataFrame(results)
    if raw_df.empty:
        # Nothing was evaluated; return the expected schema instead of
        # failing with a KeyError in the groupby below.
        return pd.DataFrame(columns=output_columns)

    # most common action
    decision_counts = (
        raw_df.groupby(['scenario_id', 'decision'])
        .size()
        .reset_index(name='counts')
    )
    top_rows = decision_counts.groupby('scenario_id')['counts'].idxmax()
    # Explicit copy: columns are added below, and the frame must not be
    # treated as a view onto `decision_counts`.
    results_df = decision_counts.loc[top_rows].copy()

    results_df['other_action'] = results_df['decision'].apply(
        lambda x: 'action1' if x == 'action2' else 'action2'
    )

    # action_likelihood: winning count divided by all records of the scenario
    total_counts = raw_df['scenario_id'].value_counts()
    results_df['action_likelihood'] = (
        results_df['counts'] / results_df['scenario_id'].map(total_counts)
    )

    # action_entropy: binary entropy; eps keeps log() finite at p == 0 or p == 1
    eps = 1e-10
    p = results_df['action_likelihood']
    results_df['action_entropy'] = -((p * np.log(p + eps)) + ((1 - p) * np.log(1 - p + eps)))

    # Attach the run metadata recorded alongside each scenario.
    run_metadata = raw_df[
        ['scenario_id', 'model_id', 'eval_technique', 'eval_top_p', 'eval_temperature']
    ].drop_duplicates()

    summary_columns = [
        'scenario_id', 'decision', 'other_action', 'action_likelihood', 'action_entropy'
    ]
    return results_df[summary_columns].merge(run_metadata, on='scenario_id')

# Playground

## Setup

In [29]:
# Run configuration for the playground cells below.
num_scenarios = 100
dataset = "low"
turns = 4

# Model under evaluation; `base_name` is the last "/"-separated part of its id.
base = create_model('anthropic/claude-3-haiku')
base_name = base.get_model_id().rsplit("/", 1)[-1]

# system_msg is False for Anthropic models and True for every other company.
system_msg = not (base._company == 'anthropic')

In [30]:
scenarios = pd.read_csv(f"data/scenarios/moralchoice_{dataset}_ambiguity.csv")

if dataset != "high":
    # Draw up to 10 scenarios per generation rule, with a fixed seed so the
    # same batch is selected on every run.
    grouped = scenarios.groupby('generation_rule')
    sampled_scenarios = grouped.apply(
        lambda rule_group: rule_group.sample(n=min(len(rule_group), 10), random_state=1)
    )
    scenarios_batch = sampled_scenarios.reset_index(drop=True)
else:
    # The "high" dataset simply uses the first `num_scenarios` rows.
    scenarios_batch = scenarios[:num_scenarios]

scenarios_batch.shape

(100, 27)

In [31]:
# Display the ids of the scenarios selected for this batch.
scenarios_batch['scenario_id']

0     C_1100
1     C_1048
2     C_1071
3     C_1080
4     C_1103
       ...  
95     C_122
96    C_1145
97    C_1178
98    C_1177
99    C_1180
Name: scenario_id, Length: 100, dtype: object

In [32]:
# Settings handed positionally to evaluate() by the evaluation cells below, as
# evaluate(scenario, base, eval_nb_samples, max_tokens, eval_temp, eval_top_p).
# Note that eval_nb_samples is set to 2 here, not the stated default of 1.
max_tokens = 200      # default = 200
eval_temp = 1.0       # default = 1.0
eval_top_p = 1.0      # default = 1.0
eval_nb_samples = 2   # default = 1

## Initial eval

### Default

In [21]:
# COLLECT
# Evaluate every scenario in the batch one after another with the baseline
# model. Each evaluate() call yields an iterable of result records; the
# per-scenario results are flattened in the PROCESS step.

initial_results = []
messages = []

for identifier, scenario in tqdm(
    scenarios_batch.iterrows(),
    total=len(scenarios_batch),
    position=0,
    ncols=100,
    leave=True,
    desc=f"Baseline eval for {base_name}"
):
    initial_results_i = evaluate(
        scenario,
        base,
        eval_nb_samples,
        max_tokens,
        eval_temp,
        eval_top_p,
        system_msg=system_msg,
    )
    initial_results.append(initial_results_i)

# PROCESS

flat_initial_results = [item for sublist in initial_results for item in sublist]
initial_results_unprocessed = pd.DataFrame(flat_initial_results)
initial_results_df = process_results(flat_initial_results)

# SAVE
# Create the output directory first: to_csv() does not create missing parent
# directories, and failing here would come only after the whole evaluation ran.
initial_results_path = Path(f"results/initial/{dataset}/{num_scenarios}/{base_name}.csv")
initial_results_path.parent.mkdir(parents=True, exist_ok=True)
initial_results_df.to_csv(initial_results_path)

# Last expression of the cell so the preview is actually rendered.
initial_results_df.head()

Baseline eval for llama-3.1-8b:   3%|▉                              | 3/100 [00:16<08:28,  5.25s/it]

### ThreadPoolExecutor

In [33]:
def process_scenario(args):
    """Evaluate a single scenario with the baseline model.

    Args:
        args: An item of ``enumerate(scenarios_batch.iterrows())``, i.e.
            ``(position, (index_label, scenario_row))``. Only the scenario row
            is used.

    Returns:
        Whatever ``evaluate`` returns for this scenario, using the notebook's
        global model and sampling settings.
    """
    _position, (_index_label, scenario) = args
    scenario_results = evaluate(
        scenario,
        base,
        eval_nb_samples,
        max_tokens,
        eval_temp,
        eval_top_p,
        system_msg=system_msg,
    )
    return scenario_results

initial_results = []
messages = []

# COLLECT
# Submit one evaluation task per scenario to a pool of 8 worker threads.
with ThreadPoolExecutor(max_workers=8) as executor:
    futures = [
        executor.submit(process_scenario, item)
        for item in enumerate(scenarios_batch.iterrows())
    ]

    # as_completed() is used only to advance the progress bar as tasks finish.
    for _ in tqdm(
        as_completed(futures),
        total=len(scenarios_batch),
        position=0,
        ncols=100,
        leave=True,
        desc=f"Baseline eval for {base_name}"
    ):
        pass

# Gather results in submission order (not completion order), so the raw
# records follow the order of `scenarios_batch` and are the same from run to
# run, matching the sequential cell. An exception raised inside a worker is
# re-raised here by future.result().
initial_results = [future.result() for future in futures]

# PROCESS
flat_initial_results = [item for sublist in initial_results for item in sublist]
initial_results_unprocessed = pd.DataFrame(flat_initial_results)
initial_results_df = process_results(flat_initial_results)

# SAVE
# Create the output directory first: to_csv() does not create missing parent
# directories, and failing here would come only after the whole evaluation ran.
initial_results_path = Path(f"results/initial/{dataset}/{num_scenarios}/{base_name}.csv")
initial_results_path.parent.mkdir(parents=True, exist_ok=True)
initial_results_df.to_csv(initial_results_path)

# Last expression of the cell so the preview is actually rendered.
initial_results_df.head()

Baseline eval for claude-3-haiku:   0%|                                     | 0/100 [00:00<?, ?it/s]

API call failed: Error code: 500 - {'type': 'error', 'error': {'type': 'api_error', 'message': 'Internal server error'}}
API call failed: Error code: 500 - {'type': 'error', 'error': {'type': 'api_error', 'message': 'Internal server error'}}


Baseline eval for claude-3-haiku:   2%|▌                            | 2/100 [01:23<56:24, 34.54s/it]

API call failed: Error code: 500 - {'type': 'error', 'error': {'type': 'api_error', 'message': 'Internal server error'}}


Baseline eval for claude-3-haiku:   3%|▊                            | 3/100 [01:24<30:37, 18.95s/it]

API call failed: Error code: 500 - {'type': 'error', 'error': {'type': 'api_error', 'message': 'Internal server error'}}


Baseline eval for claude-3-haiku:  94%|██████████████████████████▎ | 94/100 [22:24<00:33,  5.59s/it]

API call failed: Error code: 500 - {'type': 'error', 'error': {'type': 'api_error', 'message': 'Internal server error'}}


Baseline eval for claude-3-haiku:  96%|██████████████████████████▉ | 96/100 [23:24<01:06, 16.71s/it]

API call failed: Error code: 500 - {'type': 'error', 'error': {'type': 'api_error', 'message': 'Internal server error'}}


Baseline eval for claude-3-haiku:  97%|███████████████████████████▏| 97/100 [23:25<00:38, 12.90s/it]

API call failed: Error code: 500 - {'type': 'error', 'error': {'type': 'api_error', 'message': 'Internal server error'}}


Baseline eval for claude-3-haiku: 100%|███████████████████████████| 100/100 [23:31<00:00, 14.11s/it]
