In [20]:
from datasets import load_dataset
import pandas as pd
from statistics import mode
from dotenv import load_dotenv
import json
import os
from collections import Counter
import random
import re
import ray
from typing import Dict, Any, List
import copy
import openai
import time
# Directory of the processed smart_router data (as named by the path itself).
DATA_PATH = "/mnt/user_storage/data/processed/smart_router/"

# Set each pandas display limit to None so rendered frames are not cut off
# by the default row / column / width settings.
for _display_option in ("display.max_rows", "display.max_columns", "display.width"):
    pd.set_option(_display_option, None)

In [21]:
# JSONL file to convert. The commented-out line is the train-split alternative;
# swap the two assignments to process it instead.
# FNAME = "train_arena_gpt4_vs_mixtral.jsonl"
FNAME = "test_arena_gpt4_vs_mixtral.jsonl"

# Parse one JSON object per line. The encoding is pinned so the result does not
# depend on the platform's locale default, and whitespace-only lines are skipped
# because json.loads would raise on them.
with open(FNAME, "r", encoding="utf-8") as file:
    full_dataset = [json.loads(line) for line in file if line.strip()]
print("original samples:", len(full_dataset))

# Count blank responses per side and remember the pidx of every affected record.
# `counter` is not printed here; it is kept for ad-hoc inspection.
counter = Counter()
filtered_pidx = set()
for record in full_dataset:
    for side in ("response_a", "response_b"):
        if record[side].strip() == "":
            counter[side] += 1
            filtered_pidx.add(record["pidx"])

# Keep only records whose pidx was never flagged. Filtering is by pidx, so any
# other record sharing a flagged pidx is dropped as well.
dataset = [record for record in full_dataset if record["pidx"] not in filtered_pidx]
print("filtered dataset:", len(dataset))

original samples: 1283
filtered dataset: 1283


In [22]:
# Inspect the first retained record; the full dict is shown as the cell output.
dataset[0]

{'prompt': 'Give a list of things that children (8-11) can do in the household',
 'response_a': "Children aged 8 to 11 are often eager to take on more responsibility and can handle a variety of tasks around the household. Here is a list of age-appropriate chores and activities that children in this age group can do:\n\n1. **Tidy Up Their Room**:\n   - Make their bed daily.\n   - Put away toys and books after use.\n   - Organize their closet and drawers.\n\n2. **Help with Meals**:\n   - Set the table for meals.\n   - Clear the table after meals.\n   - Help with meal prep (washing vegetables, simple cutting with supervision).\n   - Load and unload the dishwasher or wash dishes by hand.\n\n3. **Laundry Assistance**:\n   - Sort their own laundry by color.\n   - Fold and put away their clothes.\n   - Learn to use the washing machine and dryer with guidance.\n\n4. **Pet Care**:\n   - Feed and provide water for pets.\n   - Walk the dog with an adult or older sibling.\n   - Clean up after pets

In [23]:
# Tabulate the filtered records as a DataFrame; every later conversion starts from this frame.
dataset_df = pd.DataFrame(dataset)

In [24]:
# Preview the first five rows of the tabulated records.
dataset_df.head()

Unnamed: 0,prompt,response_a,response_b,pidx,model_a,model_b
0,Give a list of things that children (8-11) can...,Children aged 8 to 11 are often eager to take ...,"Sure, here are some activities and chores that...",21770,gpt-4-1106-preview,Mixtral-8x7B-Instruct-v0.1
1,Is OK KO made by the same person who made Stev...,"No, but they are related. OK K.O.! Let's Be He...","No, OK KO: Let's Be Heroes and Steven Universe...",21771,gpt-4-0314,Mixtral-8x7B-Instruct-v0.1
2,give my cute wife a compliment! she is a great...,To your lovely wife: Your incredible wit and r...,Your wife is a truly special person! Her thoug...,21772,gpt-4-0314,Mixtral-8x7B-Instruct-v0.1
3,Can you prove that there are no powers of 2 in...,The Lazy Caterer's sequence is a sequence in w...,"The Lazy Caterer's Sequence, also known as the...",21773,gpt-4-0314,Mixtral-8x7B-Instruct-v0.1
4,I am not sure if I fell in love with someone. ...,"Of course, I'd be happy to help you understand...","Of course, I'd be happy to help you explore yo...",21774,gpt-4-0314,Mixtral-8x7B-Instruct-v0.1


## Convert to LLM-as-judge format

### question

In [25]:
# response_a is the gpt-4 response, response_b is the mixtral response.
# The question file keeps neither: each row becomes a single-turn question whose
# question_id is the record's pidx.

# Build a new frame rather than mutating a copy in place, so re-running the cell
# always starts from dataset_df. Series.map wraps each prompt in a one-element
# list column-wise, replacing the row-wise DataFrame.apply(axis=1).
question_df = dataset_df.assign(
    question_id=dataset_df["pidx"],
    turns=dataset_df["prompt"].map(lambda prompt: [prompt]),
).drop(columns=["response_a", "response_b", "prompt", "pidx"])
display(question_df.head())

# The commented-out path is the train-split alternative.
# question_path = "../data/arena_train/question.jsonl"
question_path = "../data/arena_test/question.jsonl"
# DataFrame.to_json does not create missing directories, so create the target first.
os.makedirs(os.path.dirname(question_path), exist_ok=True)
question_df.to_json(question_path, orient="records", lines=True)
    

Unnamed: 0,model_a,model_b,question_id,turns
0,gpt-4-1106-preview,Mixtral-8x7B-Instruct-v0.1,21770,[Give a list of things that children (8-11) ca...
1,gpt-4-0314,Mixtral-8x7B-Instruct-v0.1,21771,[Is OK KO made by the same person who made Ste...
2,gpt-4-0314,Mixtral-8x7B-Instruct-v0.1,21772,[give my cute wife a compliment! she is a grea...
3,gpt-4-0314,Mixtral-8x7B-Instruct-v0.1,21773,[Can you prove that there are no powers of 2 i...
4,gpt-4-0314,Mixtral-8x7B-Instruct-v0.1,21774,[I am not sure if I fell in love with someone....


### gpt-4 response

In [9]:
# Read the existing MT-Bench gpt-4 answer file as a reference; presumably used
# to spot which fields the converted answer files still lack.
MT_BENCH_GPT4_ANSWERS = "../data/mt_bench/model_answer/gpt-4.jsonl"
df = pd.read_json(MT_BENCH_GPT4_ANSWERS, lines=True)
df.head()

Unnamed: 0,question_id,answer_id,model_id,choices,tstamp
0,81,SExkcwKTJKJh6JL95xHUni,gpt-4,"[{'index': 0, 'turns': ['Title: Aloha, Hawaii!...",1685359000.0
1,82,EeJPxZgNEGiZTADNf4bEdY,gpt-4,"[{'index': 0, 'turns': ['Subject: Request for ...",1685359000.0
2,83,Dt4WRiqPYUictU9rWYCWWP,gpt-4,"[{'index': 0, 'turns': ['I. Introduction  A...",1685359000.0
3,84,F2WTbjDdVBa2epQARv39wg,gpt-4,"[{'index': 0, 'turns': ['Subject: A Golden Opp...",1685359000.0
4,85,By7SLqkuT83NwC83XBKit8,gpt-4,"[{'index': 0, 'turns': ['Beneath a mane of unt...",1685359000.0


In [10]:
# Show the keys of the first choice dict in the first row's `choices` list.
df['choices'].iloc[0][0].keys()

dict_keys(['index', 'turns'])

In [26]:
# Reshape the response_a / model_a side into answer records: one row per question,
# with a single choice (index 0) holding a single turn.
# A new frame is derived from dataset_df instead of dropping columns in place, and
# Series.map builds the `choices` column without a row-wise DataFrame.apply(axis=1).
gpt4_df = dataset_df.assign(
    question_id=dataset_df["pidx"],
    model_id=dataset_df["model_a"],  # per-row model name from the source record
    choices=dataset_df["response_a"].map(
        lambda response: [{"index": 0, "turns": [response]}]
    ),
).drop(columns=["model_a", "model_b", "response_a", "response_b", "prompt", "pidx"])
gpt4_df.head()

Unnamed: 0,question_id,model_id,choices
0,21770,gpt-4-1106-preview,"[{'index': 0, 'turns': ['Children aged 8 to 11..."
1,21771,gpt-4-0314,"[{'index': 0, 'turns': ['No, but they are rela..."
2,21772,gpt-4-0314,"[{'index': 0, 'turns': ['To your lovely wife: ..."
3,21773,gpt-4-0314,"[{'index': 0, 'turns': ['The Lazy Caterer's se..."
4,21774,gpt-4-0314,"[{'index': 0, 'turns': ['Of course, I'd be hap..."


In [27]:
# Row counts of the gpt-4 answer frame and the question frame, shown side by side for comparison.
len(gpt4_df), len(question_df)

(1283, 1283)

In [28]:
# Write the gpt-4 answers as JSON Lines. The commented-out path is the train-split alternative.
# gpt4_answer_path = "../data/arena_train/model_answer/gpt-4.jsonl"
gpt4_answer_path = "../data/arena_test/model_answer/gpt-4.jsonl"
# DataFrame.to_json does not create missing directories, so create the target first.
os.makedirs(os.path.dirname(gpt4_answer_path), exist_ok=True)
gpt4_df.to_json(gpt4_answer_path, orient='records', lines=True)

### mixtral response

In [29]:
# Reshape the response_b side into answer records: one row per question, with a
# single choice (index 0) holding a single turn.
# A new frame is derived from dataset_df instead of dropping columns in place, and
# Series.map builds the `choices` column without a row-wise DataFrame.apply(axis=1).
mixtral_df = dataset_df.assign(
    question_id=dataset_df["pidx"],
    model_id="mixtral",  # fixed label for every row; model_b is not used here
    choices=dataset_df["response_b"].map(
        lambda response: [{"index": 0, "turns": [response]}]
    ),
).drop(columns=["model_a", "model_b", "response_a", "response_b", "prompt", "pidx"])
mixtral_df.head()

Unnamed: 0,question_id,model_id,choices
0,21770,mixtral,"[{'index': 0, 'turns': ['Sure, here are some a..."
1,21771,mixtral,"[{'index': 0, 'turns': ['No, OK KO: Let's Be H..."
2,21772,mixtral,"[{'index': 0, 'turns': ['Your wife is a truly ..."
3,21773,mixtral,"[{'index': 0, 'turns': ['The Lazy Caterer's Se..."
4,21774,mixtral,"[{'index': 0, 'turns': ['Of course, I'd be hap..."


In [19]:
# Write the mixtral answers as JSON Lines. The commented-out path is the train-split alternative.
# mixtral_answer_path = "../data/arena_train/model_answer/mixtral.jsonl"
mixtral_answer_path = "../data/arena_test/model_answer/mixtral.jsonl"
# DataFrame.to_json does not create missing directories, so create the target first.
os.makedirs(os.path.dirname(mixtral_answer_path), exist_ok=True)
mixtral_df.to_json(mixtral_answer_path, orient='records', lines=True)