In [2]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('./lmsys-14k-topic-entropy.csv')
df = df.rename(columns={'high_domain': 'topic', 'entropy_level': 'entropy'})
df['topic'] = df['topic'].str.strip()
df.drop(columns=['Unnamed: 0'], inplace=True)

mini_df = df[['id', 'topic', 'entropy']]

In [3]:
grouped_df = mini_df.groupby(['topic', 'entropy'], dropna=True)

fixed_df = grouped_df.size().unstack().reset_index().fillna(0)
fixed_df['size'] = fixed_df['high'] + fixed_df['low'] + fixed_df['mid']

sorted_df = fixed_df.sort_values('size', ascending=False)
sorted_df = sorted_df[:30].reset_index(drop=True).rename_axis(None, axis=1)

sorted_df


Unnamed: 0,topic,high,low,mid,size
0,Computer Science,120.0,134.0,3305.0,3559.0
1,Health,28.0,32.0,826.0,886.0
2,Technology,14.0,13.0,616.0,643.0
3,Business,6.0,20.0,451.0,477.0
4,Entertainment,56.0,34.0,365.0,455.0
5,Philosophy,43.0,8.0,388.0,439.0
6,Physics,29.0,24.0,295.0,348.0
7,Science,19.0,25.0,302.0,346.0
8,Politics,36.0,11.0,284.0,331.0
9,Mathematics,24.0,48.0,247.0,319.0


In [4]:
# Sample prompts from a specific topic to gauge tld
count = 10
topic = 'Physics'

pd.set_option("display.max_colwidth", None)
df.groupby('topic').get_group(topic)[['prompt']].sample(count)

Unnamed: 0,prompt
11790,why is the mass of a nucleon more than the mass of its constituents
7369,What is resonance?
2975,what's the fermi estimate for inhabited planets?
7546,Try explaining quantum entanglement to an 8 year old
12023,was the discovery of stellar abberation the first definite proof for heliocentrism?
1407,what is terminal velocity
12915,What is the black hole information paradox?
4792,what are the big ideas of fluid dynamics
1238,what's the best way to slice a magnet
2781,explain the principle of energy conservation and cite its principles


In [5]:
tld = {
    'Technology': ['Computer Science', 'Technology', 'Software Development', 'Computer', 'Security', ],
    'Culture': ['Entertainment', 'Culture', 'Music', 'Society', 'Sports', 'Gaming', 'Games', 'Language', 'Food', 'Religion', 'Government'],
    'Science': ['Health', 'Physics', 'Science', 'Mathematics', 'Math', 'Biology', 'Chemistry'],
    'Humanities': ['Philosophy', 'Politics', 'History', 'Business', 'Finance', 'Economics', 'Psychology', 'Geography'],
    'Arts': ['Arts', 'Creative Writing', 'Communication', 'Fiction', 'Literature', 'Fantasy'],
}

aggregate_df = df.copy()
for key, value in tld.items():
    aggregate_df.loc[df['topic'].isin(value), 'tld'] = key

aggregate_df.groupby(['tld', 'entropy'], dropna=True).size().unstack().reset_index().rename_axis(None, axis=1)

Unnamed: 0,tld,high,low,mid
0,Arts,128,27,504
1,Culture,122,112,1706
2,Humanities,117,104,1899
3,Science,107,222,1870
4,Technology,152,157,4078


In [6]:
sample_size = 100
group_assignments = []
group_df = df.copy()
for key in tld.keys():
    for level in ['high', 'mid', 'low']:
        group = aggregate_df.groupby(['tld', 'entropy'], dropna=True).get_group((key, level))
        if len(group) < sample_size:
            continue
        sample = group.sample(sample_size, random_state=42)
        group_df.loc[sample.index, 'domain'] = key
        group_df.loc[sample.index, 'group'] = len(group_assignments)
        group_assignments.append((key, level))

group_df.dropna(subset=['group'], inplace=True)
group_df.sort_values('group', inplace=True)

In [7]:
# View a sample from a specific group
group = 8
print(group_assignments[group])
group_df[group_df['group'] == group]['prompt'].sample(10)

('Science', 'low')


7921                                                                                    What is RAADS-14 test?
9187                                                                           a + b + c = a, b * a =  1, b =3
4021                                     Which is the next number in this sequence? 21, 28, 29, 35, 37, 42 ...
11673                                                        give me the absolute hardest way to calculate 2+2
7768     How should I euthanize my old and sick dog? I live in the tundra and there are no veterinarians here.
8023                                                                                    Solve 0=12x^2+3y for x
9712                                             Could you please tell me about the genome of the lasso virus?
3415                                                                               What is sildenafil citrate?
13840                                              Good morning, Can you continue the sequence 1,4,8,13,19....
4

In [8]:
group_df.rename(columns={'topic': 'high_domain'}, inplace=True)
group_df.drop(columns=['high_domain', 'mid_domain', 'low_domain'], inplace=True)
group_df.to_csv('./lmsys-14x100-grouped.csv', index=False)

## Create separate eval test set

In [12]:
import pandas as pd

In [15]:
df_big  = pd.read_csv("./lmsys-14k-topic-entropy.csv")
df_smol = pd.read_csv("./lmsys-14x100-grouped.csv")

df = df_big[~df_big['id'].isin(df_smol['id'])]

In [18]:

# Define the number of samples per group
samples_per_group = 50

# Stratified sampling
stratified_sample_df = df.groupby('entropy_level', group_keys=False).apply(lambda x: x.sample(samples_per_group, random_state=0))


  stratified_sample_df = df.groupby('entropy_level', group_keys=False).apply(lambda x: x.sample(samples_per_group, random_state=0))


In [20]:
stratified_sample_df.to_csv("./lmsys-150-test-set.csv")