In [5]:
import pandas as pd
from statistics import mode
import json
import os
from collections import Counter
import random
import re
import ray
from typing import Dict, Any, List
import copy
import openai
import time
import ray

DATA_PATH = "/mnt/user_storage/data/processed/smart_router/"


In [6]:
# Lift pandas' display limits (rows, columns, width) so frames render without truncation.
for _display_option in ("display.max_rows", "display.max_columns", "display.width"):
    pd.set_option(_display_option, None)

In [7]:
# Location of the cleaned battle-conversation file for each split, keyed by split name.
dataset_path = dict(
    train="/mnt/user_storage/data/processed/smart_router/topic_modeling/clean_battle_conv_20240222_75k_train.json",
    test="/mnt/user_storage/data/processed/smart_router/topic_modeling/clean_battle_conv_20240222_5k_test.json",
)

In [8]:
# Read each split into a DataFrame and keep a list-of-dicts view (one dict per row)
# for the row-wise filtering done later in the notebook.
df_train = pd.read_json(dataset_path["train"])
dataset_train = df_train.to_dict(orient="records")

df_test = pd.read_json(dataset_path["test"])
dataset_test = df_test.to_dict(orient="records")

In [9]:

# Order is significant: filter_dataset compares the positions of the two model
# names in this list (GPT4_VERSIONS.index) when both sides of a battle are GPT-4.
GPT4_VERSIONS = ['gpt-4-1106-preview', 'gpt-4-0125-preview', 'gpt-4-0314', 'gpt-4-0613']

def select_row(conversation, row, model):
    """Build a single-turn record from one side of a battle.

    Args:
        conversation: Iterable of message dicts carrying 'role' and 'content'.
        row: Accepted for call-site compatibility; its value is not used.
        model: Model name stored under the 'model' key of the result.

    Returns:
        A dict with keys 'model', 'prompt' and 'answer' (in that order), where
        'prompt' is the first user message and 'answer' the first assistant
        message of the conversation.

    Raises:
        IndexError: If the conversation has no user or no assistant message.
    """
    user_messages, assistant_messages = [], []
    for message in conversation:
        role = message['role']
        if role == 'user':
            user_messages.append(message['content'])
        elif role == 'assistant':
            assistant_messages.append(message['content'])
    # AMJAD: for now only the first exchange is used, i.e. we train on single-turn
    # conversations (as long as they are meaningful).
    # TODO: keeping the whole conversation would need more elaborate labeling.
    return {
        'model': model,
        'prompt': user_messages[0],
        'answer': assistant_messages[0],
    }

def filter_dataset(dataset, pidx=0):
    """Keep one GPT-4 response per battle and drop battles without a GPT-4 side.

    For every row the GPT-4 side is selected as follows:
      * both sides are GPT-4: the side whose model comes first in GPT4_VERSIONS
        wins; on a tie side B is used;
      * exactly one side is GPT-4: that side is used;
      * neither side is GPT-4: the row is skipped.

    Args:
        dataset: Iterable of battle rows with 'model_a', 'model_b',
            'conversation_a' and 'conversation_b' keys.
        pidx: First prompt index to assign; each kept row receives the next
            consecutive value.

    Returns:
        List of records produced by select_row, each extended with a 'pidx' key.
    """

    def _preference(model_name):
        # A GPT-4 name missing from GPT4_VERSIONS ranks after every listed
        # version instead of raising ValueError from list.index.
        try:
            return GPT4_VERSIONS.index(model_name)
        except ValueError:
            return len(GPT4_VERSIONS)

    filtered_dataset = []
    for row in dataset:
        a_is_gpt4 = 'gpt-4' in row['model_a']
        b_is_gpt4 = 'gpt-4' in row['model_b']
        if a_is_gpt4 and b_is_gpt4:
            if _preference(row['model_a']) < _preference(row['model_b']):
                side = 'a'
            else:
                side = 'b'
        # These must be `elif`: as independent `if`s they overwrote the
        # preference-based choice above with side A every time.
        elif a_is_gpt4:
            side = 'a'
        elif b_is_gpt4:
            side = 'b'
        else:
            continue
        new_row = select_row(row['conversation_' + side], row, row['model_' + side])
        new_row['pidx'] = pidx
        filtered_dataset.append(new_row)
        pidx += 1
    return filtered_dataset

In [10]:
# Keep only battles with a GPT-4 side. The test split starts numbering at
# len(filtered_dataset_train), so its pidx values continue after the train split's.
filtered_dataset_train = filter_dataset(dataset_train)
filtered_dataset_test = filter_dataset(dataset_test, pidx=len(filtered_dataset_train))


# Row counts before and after filtering, for both splits.
len(dataset_train), len(filtered_dataset_train), len(dataset_test), len(filtered_dataset_test)

(75000, 21770, 5000, 1453)

In [11]:

filtered_dataset_train[100]


{'model': 'gpt-4-0613',
 'prompt': 'Answer the following statements with "Agree" or "Disagree" only. You answers should be returned in list form, in the same order they appear below, with the question number and answer only. Do not give an explanation. Do not skip any question. You must answer every question with agree or disagree, you can\'t be neutral or undecided.\n\n1\tFreedom of business is the best practical way a society can prosper.\n2\tThe harder you work, the more you progress up the social ladder.\n3\tEconomic inequality is too high in the world.\n4\tCharity is a better way of helping those in need than social welfare.\n5\tLand should not be a commodity to be bought and sold.\n6\tThe current welfare system should be expanded to further combat inequality.\n7\tAll industry and the bank should be nationalised.\n8\tClass is the primary division of society.\n9\tTaxation of the wealthy is a bad idea, society would be better off without it.\n10\tA government that provides for every

In [12]:
# Wrap the filtered records of each split in a DataFrame (one row per record).
filtered_train_df = pd.DataFrame(filtered_dataset_train)
filtered_test_df = pd.DataFrame(filtered_dataset_test)


In [13]:
# Report whether any prompt occurs more than once in the train split, then keep
# a single row per distinct prompt.
print("duplicates?", filtered_train_df.duplicated("prompt").any())
filtered_train_df = filtered_train_df.drop_duplicates(subset=["prompt"])
print("after removing duplicates:", len(filtered_train_df))

duplicates? True
after removing duplicates: 20180


In [14]:

# Same de-duplication for the test split: a single row per distinct prompt.
filtered_test_df = filtered_test_df.drop_duplicates(subset=["prompt"])
print("after removing duplicates:", len(filtered_test_df))

after removing duplicates: 1417


In [15]:
# Test rows whose prompt also appears in the train split.
duplicates = filtered_test_df[filtered_test_df['prompt'].isin(filtered_train_df['prompt'])]

# Drop those rows so that no prompt is shared between the train and test splits.
filtered_test_df = filtered_test_df[~filtered_test_df['prompt'].isin(duplicates['prompt'])]
len(filtered_test_df)

1284

## Export subset as JSONL

In [16]:
# Write each split as JSON Lines (one record per line); the file names are
# relative, so the files land in the current working directory.
filtered_train_df.to_json("train_arena_gpt4.jsonl", orient="records", lines=True)
filtered_test_df.to_json("test_arena_gpt4.jsonl", orient="records", lines=True)


In [17]:
# Load environment variables from a .env file so the endpoint URL and API key
# do not have to be written into the notebook.
import dotenv
dotenv.load_dotenv('/home/ray/default/.env')

# os.getenv returns None for a variable that is not set.
API_BASE = os.getenv("ANYSCALE_BASE_URL")
API_KEY = os.getenv("ANYSCALE_API_KEY")

## Get Mixtral responses from the Anyscale endpoint

In [18]:
@ray.remote(num_cpus=0)
def get_llm_response(
    llm: str,
    pidx: int,
    messages: List[Dict[str, str]],
    max_retries=1,
    retry_interval=60,
) -> tuple:
    """Request one chat completion and return it tagged with its prompt index.

    Declared as a Ray remote function (num_cpus=0). The client is built from the
    module-level API_BASE and API_KEY values.

    Args:
        llm: Model identifier passed to the chat-completions endpoint.
        pidx: Prompt index, echoed back so the caller can match the response.
        messages: Chat messages in OpenAI format.
        max_retries: Number of additional attempts after the first failure.
        retry_interval: Seconds to wait between attempts.

    Returns:
        A ``(pidx, content)`` tuple. ``content`` is the first choice's message
        content, or ``""`` when every attempt raised.
    """
    import time

    client = openai.OpenAI(base_url=API_BASE, api_key=API_KEY)
    total_attempts = max_retries + 1

    for attempt in range(total_attempts):
        try:
            response = client.chat.completions.create(
                model=llm,
                messages=messages,
                temperature=0.7,
                max_tokens=512,
            )
            return (pidx, response.choices[0].message.content)
        except Exception as e:
            # Best effort: report the failure and try again if attempts remain.
            print(f"Exception: {e}")
            # Only wait when another attempt follows; sleeping after the last
            # failure just delays the empty result by retry_interval seconds.
            if attempt + 1 < total_attempts:
                time.sleep(retry_interval)  # default is per-minute rate limits
    return (pidx, "")


def _write_responses(fname: str, responses: List[Any]) -> None:
    """Overwrite ``fname`` with one ``{pidx: response}`` JSON object per line."""
    with open(fname, "w") as file:
        for pidx, resp in responses:
            file.write(json.dumps({pidx: resp}))
            file.write("\n")


def generate_batch_responses(
    llm: str,
    queries: Dict[int, Any],
    responses_done: List[Any],
    fname: str,
    max_concurrent_queries: int,
    verbose: bool = False,
):
    """Run all queries through get_llm_response with bounded concurrency.

    Args:
        llm: Model identifier forwarded to get_llm_response.
        queries: Mapping of pidx to the chat messages to send. Not mutated.
        responses_done: ``(pidx, response)`` pairs that are already finished;
            they are carried over into the result. Not mutated.
        fname: Checkpoint file, rewritten each time the number of collected
            responses reaches a new multiple of 10.
        max_concurrent_queries: Upper bound on tasks in flight at once.
        verbose: Print queue state and per-response timing on every poll.

    Returns:
        List of ``(pidx, response)`` pairs: ``responses_done`` followed by the
        newly collected responses in completion order.

    Raises:
        ValueError: If there is work to do but ``max_concurrent_queries`` is
            below 1 (the loop could never submit anything).
    """
    if queries and max_concurrent_queries < 1:
        raise ValueError("max_concurrent_queries must be at least 1")

    print(f"to do: {len(queries)}. total: {len(responses_done)+len(queries)}.")
    queue = copy.copy(queries)
    in_progress = []
    responses = copy.copy(responses_done)
    # Size of `responses` at the last checkpoint; prevents rewriting the file on
    # every poll while the count sits on the same multiple of 10.
    last_checkpoint = len(responses_done)
    start_time = time.time()
    while queue or in_progress:
        # Top up to the concurrency limit right away instead of submitting a
        # single task per polling interval.
        while queue and len(in_progress) < max_concurrent_queries:
            pidx, messages = queue.popitem()
            in_progress.append(get_llm_response.remote(llm, pidx, messages))
        ready, in_progress = ray.wait(in_progress, timeout=0.5)
        if verbose:
            print(
                f"No. un-processed: {len(queue)}, in-progress: {len(in_progress)}, ready: {len(ready)}"
            )
        if ready:
            if verbose:
                elapsed_time = time.time() - start_time
                print(f"response done in {elapsed_time:.2f}sec")
            start_time = time.time()
            responses.extend(ray.get(ready))

        if len(responses) % 10 == 0 and len(responses) > last_checkpoint:
            print(f"{len(responses)} done.")
            _write_responses(fname, responses)
            last_checkpoint = len(responses)

    print("finished!")
    return responses


def to_openai_messages(prompt, system_prompt=None):
    """Wrap a single prompt as an OpenAI chat-completion message list.

    Args:
        prompt: Content of the user message.
        system_prompt: Content of the system message; when None, the default
            "You are a helpful assistant." is used.

    Returns:
        A two-element list: the system message followed by the user message.
    """
    system_text = "You are a helpful assistant." if system_prompt is None else system_prompt
    return [
        {"role": "system", "content": system_text},
        {"role": "user", "content": prompt},
    ]

In [22]:
# start generation with API
# Run configuration: input file stem, short tag for the new model, the model
# identifier sent to the endpoint, and the cap on in-flight requests.
FNAME = "train_arena_gpt4"
NEW_MODEL = "mixtral"
LLM = "mistralai/Mixtral-8x7B-Instruct-v0.1"
max_concurrent_queries = 25

generations_fname = f"{FNAME}-{NEW_MODEL}_resp.jsonl"

# Resume support: collect responses from an earlier run so they are not redone.
# Each line holds a {pidx: response} object whose key was serialised as a string.
responses_done = {}
if os.path.exists(generations_fname):
    with open(generations_fname, 'r') as f:
        for line in f:
            for key, value in json.loads(line).items():
                responses_done[int(key)] = value

print("done already:", len(responses_done))

done already: 0


In [23]:

# load queries
# Build the pending work list: pidx -> chat messages, leaving out every prompt
# that already has a stored response.
queries = {}
with open(f"{FNAME}.jsonl", 'r') as f:
    for line in f:
        record = json.loads(line)
        if record["pidx"] not in responses_done:
            queries[record["pidx"]] = to_openai_messages(record["prompt"])

print("# queries:", len(queries))

# queries: 20180


In [24]:
# generate_batch_responses takes the finished responses as (pidx, response) pairs.
responses_done_list = [(p, r) for p, r in responses_done.items()]
responses = generate_batch_responses(
    LLM, queries, responses_done_list, generations_fname, max_concurrent_queries, verbose=False
)
# Final write of every response, one {pidx: response} JSON object per line.
with open(generations_fname, "w") as file:
    for pidx, resp in responses:
        file.write(json.dumps({pidx: resp}))
        file.write("\n")



to do: 20180. total: 20180.
10 done.
20 done.
30 done.
30 done.
30 done.
40 done.
50 done.
60 done.
70 done.
80 done.
90 done.
100 done.
110 done.
120 done.
130 done.
140 done.
150 done.
160 done.
170 done.
180 done.
190 done.
200 done.
210 done.
220 done.
230 done.
240 done.
250 done.
260 done.
270 done.
280 done.
290 done.
300 done.
310 done.
320 done.
330 done.
340 done.
350 done.
360 done.
370 done.
380 done.
390 done.
400 done.
410 done.
420 done.
430 done.
440 done.
440 done.
450 done.
460 done.
470 done.
480 done.
490 done.
500 done.
510 done.
520 done.
530 done.
540 done.
550 done.
550 done.
560 done.
570 done.
580 done.
590 done.
600 done.
610 done.
620 done.
630 done.
640 done.
650 done.
660 done.
670 done.
680 done.
690 done.
700 done.
710 done.
720 done.
730 done.
740 done.
750 done.
760 done.
770 done.
780 done.
790 done.
800 done.
810 done.
820 done.
830 done.
840 done.
850 done.
860 done.
870 done.
880 done.
890 done.
900 done.
910 done.
910 done.
920 done.
930 done.
940

[36m(autoscaler +1h5m39s)[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.


## Combine the GPT-4 and Mixtral responses into one dataset

In [48]:

import json

# Input file for this pass; switch to the commented line to process the test split.
FNAME = "train_arena_gpt4.jsonl"
# FNAME = "test_arena_gpt4.jsonl"
# Read the exported records back, one JSON object per line.
dataset = []
with open(FNAME, "r") as file:
    for line in file:
        record = json.loads(line)
        dataset.append(record)
print(len(dataset))

import copy
# Re-key the records by pidx in pairwise form: side A is the stored answer and
# its model; side B is labelled as Mixtral, with response_b left as None here.
dataset_dict = {}
for record in dataset:
    new_record = {}
    new_record["prompt"] = record["prompt"]
    new_record["response_a"] = record["answer"]
    new_record["response_b"] = None
    new_record["pidx"] = record["pidx"]
    new_record["model_a"] = record["model"]
    new_record["model_b"] = "Mixtral-8x7B-Instruct-v0.1"
    dataset_dict[record['pidx']] = new_record
print(len(dataset_dict))


20180
20180


In [49]:
# Attach each stored Mixtral response to its record, matched by pidx.
with open("train_arena_gpt4-mixtral_resp.jsonl", "r") as file:
# with open("test_arena_gpt4-mixtral_resp.jsonl", "r") as file:
    for line in file:
        # Tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads.
        if not line.strip():
            continue
        row_dict = json.loads(line)
        pidx, resp = next(iter(row_dict.items()))
        pidx = int(pidx)  # JSON object keys are strings
        # A stored null response is treated as empty rather than crashing on
        # .strip(); empty responses are dropped by the filter below.
        resp = (resp or "").strip()
        dataset_dict[pidx]["response_b"] = resp

# Keep only records where both sides have a non-empty response.
final_dataset = {}
for pidx, record in dataset_dict.items():
    if record['response_a'] and record['response_b']:
        final_dataset[pidx] = record
print(len(final_dataset))


20155


In [47]:
# Write the merged pairwise records as JSON Lines; switch to the commented
# line to export the test split instead.
FNAME = "train_arena_gpt4_vs_mixtral.jsonl"
# FNAME = "test_arena_gpt4_vs_mixtral.jsonl"
with open(FNAME, "w") as file:
    for record in final_dataset.values():
        file.write(json.dumps(record) + "\n")

{0: {'prompt': 'Consider the current geopolitical situation in Isreal',
  'response_a': "As an AI language model, I cannot provide real-time information. As of my last update in September 2021, the geopolitical situation in Israel can be described as follows:\n\n1. Israeli-Palestinian conflict: The ongoing conflict between Israel and Palestine remains the most significant challenge in the region. Issues such as Israeli settlements in the West Bank, the status of Jerusalem, border disputes, and the rights of Palestinian refugees are central to the conflict. In May 2021, tensions escalated into an 11-day military confrontation between Israel and Hamas, the Palestinian group controlling the Gaza Strip. A ceasefire was brokered by Egypt, but the underlying issues remain unresolved.\n\n2. Normalization of relations with Arab countries: In late 2020 and early 2021, Israel signed the Abraham Accords, normalizing relations with the United Arab Emirates, Bahrain, Sudan, and Morocco. These agree