In [1]:
from core.models import GPTModel, ClaudeModel
from core.config import *
from tqdm.notebook import tqdm, trange
import os
import openai
import numpy as np
import json, re
from datasets import load_dataset
import json
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
import multiprocessing as mp
import time


## Helper

In [2]:

def extract_json(text):
    """Extract a JSON value from ``text``, tolerating surrounding prose.

    The lookup proceeds in two stages:

    1. Parse ``text`` as a complete JSON document and return the result
       (this may be any JSON value, not only an object).
    2. Otherwise try every ``{`` in ``text`` from left to right as the start
       of a JSON object and return the first one that decodes. Decoding is
       done by ``json.JSONDecoder.raw_decode``, so braces that appear inside
       string values (e.g. ``{"q": "close with }"}``) do not confuse the
       search, and the outermost object is preferred over objects nested
       inside it.

    Args:
        text: The raw string to search, e.g. a model response that wraps a
            JSON object in explanatory text or a code fence.

    Returns:
        The decoded JSON value (a ``dict`` whenever stage 2 is used).

    Raises:
        ValueError: If ``text`` is not valid JSON and contains no decodable
            JSON object.
    """
    # Stage 1: the whole text is already valid JSON.
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Stage 2: scan candidate object starts. raw_decode stops at the end of
    # the first complete value, so trailing prose after the object is fine.
    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            obj, _end = decoder.raw_decode(text, start)
            return obj
        except json.JSONDecodeError:
            # Not an object start (or a malformed one); try the next brace.
            start = text.find('{', start + 1)

    raise ValueError("No valid JSON object found in the text")

## Topics

In [3]:
# Topics to generate samples for. Only the uncommented entries are processed;
# the commented ones are kept as ready-to-enable alternatives.
topics = [
    # "Providing dating advice",
    # "Writing a professional resume",
    # "Identifying and Correcting Grammar Mistakes",
    # "Conducting Professional Phone Calls",
    # "Acting as a Customer Service Representative",
    # "Writing Professional Emails",
    # "Solving leetcode questions in python",
    # "Analyzing and critiquing fictional literature"
    "Analyzing Financial Reports",
    # "Translating English Sentences to Classical Chinese (文言文)"
]

## Generate samples

In [4]:
SYSTEM = "You're an expert at evaluating AI assistants. Create realistic, challenging questions that test their capabilities, biases, and response patterns. Design queries with conflicting requirements, ethical dilemmas, or complex scenarios that are difficult to address perfectly, revealing the AI's limitations and decision-making processes."

RAW_DIR = '../datasets/openend/_raw'
BATCH_SIZE = 10    # entries read from each response, under the keys "1".."10"
NUM_BATCHES = 10   # one seed request plus nine follow-ups -> 100 samples per topic
MAX_ATTEMPTS = 3   # tries per batch when the response cannot be parsed

# Explicit UTF-8 so reading the prompts does not depend on the platform's
# locale encoding (assumes the prompt files are stored as UTF-8 -- confirm).
with open(f'{RAW_DIR}/build_dataset.prompt', 'r', encoding='utf-8') as f:
    TEMPLATE = f.read()

with open(f'{RAW_DIR}/few_shot.prompt', 'r', encoding='utf-8') as f:
    few_shot_str = f.read()


def _build_prompt(topic, seen_tldrs, seen_subtopics):
    """Fill TEMPLATE for ``topic``.

    ``seen_tldrs`` and ``seen_subtopics`` are the summaries and subtopics
    collected so far; they are newline-joined into the prompt. When a list is
    empty (the seed request) ``None`` is passed to the template instead.
    """
    return TEMPLATE.format(
        topic=topic,
        few_shot=few_shot_str,
        tldr='\n'.join(seen_tldrs) if seen_tldrs else None,
        subtopics='\n'.join(seen_subtopics) if seen_subtopics else None,
    )


# A single unparsable or incomplete response would otherwise abort the topic
# and throw away every batch gathered before it, so parse/schema failures are
# retried with a fresh request. reraise=True surfaces the original exception
# (not tenacity's RetryError) once the attempts are exhausted.
@retry(stop=stop_after_attempt(MAX_ATTEMPTS),
       wait=wait_exponential(multiplier=1, min=1, max=10),
       retry=retry_if_exception_type((ValueError, KeyError, TypeError)),
       reraise=True)
def _request_batch(prompt):
    """Send ``prompt`` to a fresh model and unpack one batch from the reply.

    Returns:
        A tuple ``(items, batch_tldrs, subtopic)`` where ``items`` are the
        response entries keyed "1".."BATCH_SIZE", ``batch_tldrs`` is the
        ``'tldr'`` field of each item, and ``subtopic`` is the response's
        ``'subtopic'`` field.

    Raises:
        ValueError: If no JSON can be extracted from the reply.
        KeyError, TypeError: If the decoded reply lacks the expected keys.
    """
    # A new model per request, so no conversation state carries over.
    # (To use GPT instead: GPTModel(system_prompt=SYSTEM, model_name='gpt-4o').)
    model = ClaudeModel(system_prompt=SYSTEM,
                        model_name=CLAUDE_3_MODEL_NAME)
    response = extract_json(model(prompt, use_json=False))
    items = [response[str(k)] for k in range(1, BATCH_SIZE + 1)]
    batch_tldrs = [item['tldr'] for item in items]
    return items, batch_tldrs, response['subtopic']


for t in tqdm(topics, desc='Topics'):
    results_file = f'{RAW_DIR}/{t}.json'
    # TODO: optionally skip topics whose results_file already exists.

    # Seed batch: nothing has been generated yet for this topic.
    items, batch_tldrs, subtopic = _request_batch(_build_prompt(t, [], []))
    samples = list(items)
    tldrs = list(batch_tldrs)
    subtopics = [subtopic]

    # Follow-up batches: each prompt lists everything generated so far.
    for _ in trange(NUM_BATCHES - 1, desc='Samples'):
        items, batch_tldrs, subtopic = _request_batch(
            _build_prompt(t, tldrs, subtopics))
        samples.extend(items)
        tldrs.extend(batch_tldrs)
        subtopics.append(subtopic)

    # Number the samples from 1 in generation order.
    sample_dict = {idx: item for idx, item in enumerate(samples, start=1)}

    # Dump the samples to a JSON file, indented for readability.
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump(sample_dict, f, indent=4)

Topics:   0%|          | 0/1 [00:00<?, ?it/s]

Samples:   0%|          | 0/9 [00:00<?, ?it/s]