In [2]:
from datasets import load_dataset
import pandas as pd
from statistics import mode
from dotenv import load_dotenv
import json
import os
from collections import Counter
import random
import re
import ray
from typing import Dict, Any, List
import copy
import openai
import time
# Root directory holding the processed smart-router data files.
# The historical absolute path stays the default; set the
# SMART_ROUTER_DATA_PATH environment variable to point the notebook at a
# different location without editing this cell.
DATA_PATH = os.environ.get(
    "SMART_ROUTER_DATA_PATH", "/mnt/user_storage/data/processed/smart_router/"
)

# Lift pandas' display limits (rows, columns, width) by setting each to None.
for _display_option in ("display.max_rows", "display.max_columns", "display.width"):
    pd.set_option(_display_option, None)

In [7]:
# Load the full comparison set (all source subsets, not just lmsys-chat) and
# drop every prompt for which either response is empty.
FNAME = "nectar_filtered_gpt-4_vs_mixtral.jsonl"  # plain literal: nothing to interpolate

# JSON Lines input: one JSON object per non-blank line. The encoding is pinned
# to UTF-8 so decoding does not depend on the platform's default.
full_dataset = []
with open(os.path.join(DATA_PATH, FNAME), "r", encoding="utf-8") as file:
    for line in file:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at EOF
        full_dataset.append(json.loads(line))
print("original samples:", len(full_dataset))

# Tally empty responses per side and remember the prompt ids to discard.
counter = Counter()
filtered_pidx = set()
for record in full_dataset:
    for side in ("response_a", "response_b"):
        if not record[side].strip():
            counter[side] += 1
            filtered_pidx.add(record["pidx"])

# Filtering is by pidx, so every record sharing a flagged id is removed.
dataset = [record for record in full_dataset if record["pidx"] not in filtered_pidx]
print("filtered dataset:", len(dataset))

original samples: 121929
filtered dataset: 121898


In [8]:
# Peek at the first record that survived filtering to check its fields.
first_record = dataset[0]
first_record

{'prompt': '0.002 = 1000 \n1 = x?',
 'source': ['sharegpt'],
 'pidx': 0,
 'response_a': 'If 0.002 equals 1000, then 1 would equal 500,000.',
 'response_b': 'If we combine the two equations, we get:\n\n0.002 = 1000 - x\n\nTo solve for x, we need to isolate x on one side of the equation. We can do this by subtracting 0.002 from both sides:\n\nx = 1000 - 0.002\n\nThis simplifies to:\n\nx = 999.998\n\nSo, if +0.002 equals 1000, then 1 is equivalent to 999.998.'}

In [10]:
# Tabular view of the filtered records: one row per record, one column per key.
dataset_df = pd.DataFrame(data=dataset)

In [12]:
# Show the first five rows (the default for head()).
dataset_df.head(n=5)

Unnamed: 0,prompt,source,pidx,response_a,response_b
0,0.002 = 1000 \n1 = x?,[sharegpt],0,"If 0.002 equals 1000, then 1 would equal 500,000.","If we combine the two equations, we get:\n\n0...."
1,0:00\nwhat's going on guys it's NAME_1 from\n0...,[lmsys-chat-1m],1,It seems like you're discussing the concept of...,Central Nervous System (CNS) fatigue refers to...
2,01011001 01001111 01010101 00100000 01000010 0...,[anthropic-hh],2,"The binary code you provided translates to ""YO...",Thank you! I'm glad to be of help. I see that ...
3,"""012345"", ""001122"", ""ee44aa"", ""abcdef"" are bea...",[lmsys-chat-1m],3,"Sure, here is a simple Golang function that ch...","Sure, I can help you with that! Based on your ..."
4,#01 You are an assistant that helps me to make...,[lmsys-chat-1m],4,Understood. I am here to assist you in making ...,"keep\_going\_straight, as it requires coordina..."


### counting data sources

In [13]:
# Count prompts per source combination by grouping on the string form of the
# `source` list. The string keeps the list's element order, so the same
# sources listed in a different order end up in a separate group.
# NOTE: `source_str` is added to dataset_df in place.
source_as_text = dataset_df['source'].astype(str)
dataset_df['source_str'] = source_as_text
dataset_df.groupby(by="source_str").size()

source_str
['anthropic-hh', 'lmsys-chat-1m', 'sharegpt']        2
['anthropic-hh', 'sharegpt']                         9
['anthropic-hh']                                 18927
['evol_instruct', 'sharegpt']                        1
['evol_instruct']                                 9961
['false_qa', 'lmsys-chat-1m']                        1
['false_qa']                                      2306
['flan_v2_cot']                                   3000
['flan_v2_flan2021']                               461
['flan_v2_niv2', 'lmsys-chat-1m']                    1
['flan_v2_niv2']                                 11985
['flan_v2_p3']                                    1929
['lmsys-chat-1m', 'anthropic-hh']                   15
['lmsys-chat-1m', 'sharegpt']                       95
['lmsys-chat-1m']                                37651
['sharegpt']                                     24844
['truthful_qa', 'lmsys-chat-1m', 'sharegpt']         3
['truthful_qa', 'lmsys-chat-1m']                     2

## selecting a subset

In [27]:
# Keep only the rows whose `source` value mentions at least one wanted source.
source_list = ['lmsys-chat-1m']


def _mentions_wanted_source(sources):
    """Return True when any entry of source_list occurs in `sources`."""
    for wanted in source_list:
        if wanted in sources:
            return True
    return False


subset_df = dataset_df[dataset_df['source'].apply(_mentions_wanted_source)]

print(len(subset_df))

37770


## Convert to LLM-as-judge format

### question

In [47]:
# Build the question file for the LLM-as-judge format: one record per prompt
# with `source`, `question_id` (the original pidx) and a single-element
# `turns` list containing the prompt text.
# In subset_df, response_a is the gpt-4 response and response_b is the mixtral
# response; neither belongs in the question file, so both are dropped here.
question_df = subset_df.assign(
    question_id=subset_df['pidx'],
    # Map over the prompt column only instead of applying a lambda row by row.
    turns=subset_df['prompt'].map(lambda prompt: [prompt]),
).drop(columns=['response_a', 'response_b', 'source_str', 'prompt', 'pidx'])
display(question_df.head())

# DataFrame.to_json does not create missing parent directories, so make sure
# the output folder exists before writing the JSON Lines file.
question_path = "data/nectar_data/question.jsonl"
os.makedirs(os.path.dirname(question_path), exist_ok=True)
question_df.to_json(question_path, orient='records', lines=True)
    

Unnamed: 0,source,question_id,turns
1,[lmsys-chat-1m],1,[0:00\nwhat's going on guys it's NAME_1 from\n...
3,[lmsys-chat-1m],3,"[""012345"", ""001122"", ""ee44aa"", ""abcdef"" are be..."
4,[lmsys-chat-1m],4,[#01 You are an assistant that helps me to mak...
6,[lmsys-chat-1m],6,[0. Definition of terms\n0-1. Input is a list ...
7,[lmsys-chat-1m],7,[0 looks like O and 1 looks like I. What does ...


### gpt-4 response

In [59]:
# Load the existing mt_bench gpt-4 answer file to see which fields it carries
# and, by comparison, what our own answer records are missing.
reference_answers_path = "data/mt_bench/model_answer/gpt-4.jsonl"
df = pd.read_json(reference_answers_path, lines=True)
df.head()

Unnamed: 0,question_id,answer_id,model_id,choices,tstamp
0,81,SExkcwKTJKJh6JL95xHUni,gpt-4,"[{'index': 0, 'turns': ['Title: Aloha, Hawaii!...",1685359000.0
1,82,EeJPxZgNEGiZTADNf4bEdY,gpt-4,"[{'index': 0, 'turns': ['Subject: Request for ...",1685359000.0
2,83,Dt4WRiqPYUictU9rWYCWWP,gpt-4,"[{'index': 0, 'turns': ['I. Introduction  A...",1685359000.0
3,84,F2WTbjDdVBa2epQARv39wg,gpt-4,"[{'index': 0, 'turns': ['Subject: A Golden Opp...",1685359000.0
4,85,By7SLqkuT83NwC83XBKit8,gpt-4,"[{'index': 0, 'turns': ['Beneath a mane of unt...",1685359000.0


In [57]:
# Inspect the keys of the first choice in the first reference answer.
first_choice = df['choices'].iloc[0][0]
first_choice.keys()

dict_keys(['index', 'turns'])

In [60]:
# gpt-4 answer frame: each record carries `source`, `question_id` (the
# original pidx), `model_id` and a one-element `choices` list whose single
# turn is response_a.
gpt4_df = subset_df.assign(
    question_id=subset_df['pidx'],
    model_id='gpt-4',
    choices=subset_df['response_a'].map(
        lambda answer: [{"index": 0, "turns": [answer]}]
    ),
).drop(columns=['response_a', 'response_b', 'source_str', 'prompt', 'pidx'])
gpt4_df.head()

Unnamed: 0,source,question_id,model_id,choices
1,[lmsys-chat-1m],1,gpt-4,"[{'index': 0, 'turns': ['It seems like you're ..."
3,[lmsys-chat-1m],3,gpt-4,"[{'index': 0, 'turns': ['Sure, here is a simpl..."
4,[lmsys-chat-1m],4,gpt-4,"[{'index': 0, 'turns': ['Understood. I am here..."
6,[lmsys-chat-1m],6,gpt-4,"[{'index': 0, 'turns': ['Understood. Here's ho..."
7,[lmsys-chat-1m],7,gpt-4,"[{'index': 0, 'turns': ['The number 2 doesn't ..."


In [61]:
# Show the row counts of the answer frame and the question frame side by side.
answer_count = len(gpt4_df)
question_count = len(question_df)
answer_count, question_count

(37770, 37770)

In [63]:
# Write the gpt-4 answers as JSON Lines. DataFrame.to_json does not create
# missing parent directories, so ensure the output folder exists first.
gpt4_answer_path = "data/nectar_data/model_answer/gpt-4.jsonl"
os.makedirs(os.path.dirname(gpt4_answer_path), exist_ok=True)
gpt4_df.to_json(gpt4_answer_path, orient='records', lines=True)

### mixtral response

In [66]:
# mixtral answer frame: each record carries `source`, `question_id` (the
# original pidx), `model_id` and a one-element `choices` list whose single
# turn is response_b.
mixtral_df = subset_df.assign(
    question_id=subset_df['pidx'],
    model_id='mixtral',
    choices=subset_df['response_b'].map(
        lambda answer: [{"index": 0, "turns": [answer]}]
    ),
).drop(columns=['response_a', 'response_b', 'source_str', 'prompt', 'pidx'])
mixtral_df.head()

Unnamed: 0,source,question_id,model_id,choices
1,[lmsys-chat-1m],1,mixtral,"[{'index': 0, 'turns': ['Central Nervous Syste..."
3,[lmsys-chat-1m],3,mixtral,"[{'index': 0, 'turns': ['Sure, I can help you ..."
4,[lmsys-chat-1m],4,mixtral,"[{'index': 0, 'turns': ['keep\_going\_straight..."
6,[lmsys-chat-1m],6,mixtral,"[{'index': 0, 'turns': ['Understood. I will ge..."
7,[lmsys-chat-1m],7,mixtral,"[{'index': 0, 'turns': ['The digit ""2"" does no..."


In [65]:
# Write the mixtral answers as JSON Lines. DataFrame.to_json does not create
# missing parent directories, so ensure the output folder exists first.
mixtral_answer_path = "data/nectar_data/model_answer/mixtral.jsonl"
os.makedirs(os.path.dirname(mixtral_answer_path), exist_ok=True)
mixtral_df.to_json(mixtral_answer_path, orient='records', lines=True)

In [4]:
# Scratch check: True when the two sets share no element.
{1, 2, 4}.isdisjoint({5, 6})

True