In [266]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('./lmsys-14k-topic-entropy.csv')
df = df.rename(columns={'high_domain': 'topic', 'entropy_level': 'entropy'})
df['topic'] = df['topic'].str.strip()
df.drop(columns=['Unnamed: 0'], inplace=True)

mini_df = df[['id', 'topic', 'entropy']]

In [267]:
grouped_df = mini_df.groupby(['topic', 'entropy'], dropna=True)

fixed_df = grouped_df.size().unstack().reset_index().fillna(0)
fixed_df['size'] = fixed_df['high'] + fixed_df['low'] + fixed_df['mid']

sorted_df = fixed_df.sort_values('size', ascending=False)
sorted_df = sorted_df[:30].reset_index(drop=True).rename_axis(None, axis=1)

sorted_df


Unnamed: 0,topic,high,low,mid,size
0,Computer Science,120.0,134.0,3305.0,3559.0
1,Health,28.0,32.0,826.0,886.0
2,Technology,14.0,13.0,616.0,643.0
3,Business,6.0,20.0,451.0,477.0
4,Entertainment,56.0,34.0,365.0,455.0
5,Philosophy,43.0,8.0,388.0,439.0
6,Physics,29.0,24.0,295.0,348.0
7,Science,19.0,25.0,302.0,346.0
8,Politics,36.0,11.0,284.0,331.0
9,Mathematics,24.0,48.0,247.0,319.0


In [268]:
# Sample prompts from a specific topic to gauge tld
count = 10
topic = 'Physics'

pd.set_option("display.max_colwidth", None)
df.groupby('topic').get_group(topic)[['prompt']].sample(count)

Unnamed: 0,prompt
7053,How is create a black hole?
1643,Explain the Theory of Relativity in grade 2 english
6738,Explain quantum mechanics to a 13 yo.
8960,explain dark matter
11048,"why is the north pole of the earth called like this, although it is a magnetic south pole?"
989,Explain to me the concept of Quantum Tunelling diodes
11011,What is non-invertible symmetry in condensed matter physics?
6012,you are an expert for acoustics. Explain why airliners are louder on cold days
8545,how photon clocks work
8828,Why does black holes exist


In [269]:
tld = {
    'Technology': ['Computer Science', 'Technology', 'Software Development', 'Computer', 'Security', ],
    'Culture': ['Entertainment', 'Culture', 'Music', 'Society', 'Sports', 'Gaming', 'Games', 'Language', 'Food', 'Religion', 'Government'],
    'Science': ['Health', 'Physics', 'Science', 'Mathematics', 'Math', 'Biology', 'Chemistry'],
    'Social Science': ['Philosophy', 'Politics', 'History', 'Business', 'Finance', 'Economics', 'Psychology', 'Geography'],
    'Writing': ['Arts', 'Creative Writing', 'Communication', 'Fiction', 'Literature', 'Fantasy'],
}

aggregate_df = df.copy()
for key, value in tld.items():
    aggregate_df.loc[df['topic'].isin(value), 'tld'] = key

aggregate_df.groupby(['tld', 'entropy'], dropna=True).size().unstack().reset_index().rename_axis(None, axis=1)

Unnamed: 0,tld,high,low,mid
0,Culture,122,112,1706
1,Science,107,222,1870
2,Social Science,117,104,1899
3,Technology,152,157,4078
4,Writing,128,27,504


In [270]:
sample_size = 100
group_assignments = []
group_df = df.copy()
for key in tld.keys():
    for level in ['high', 'mid', 'low']:
        group = aggregate_df.groupby(['tld', 'entropy'], dropna=True).get_group((key, level))
        if len(group) < sample_size:
            continue
        sample = group.sample(sample_size, random_state=42)
        group_df.loc[sample.index, 'domain'] = key
        group_df.loc[sample.index, 'group'] = len(group_assignments)
        group_assignments.append((key, level))

group_df.dropna(subset=['group'], inplace=True)
group_df.sort_values('group', inplace=True)

In [271]:
# View a sample from a specific group
group = 8
print(group_assignments[group])
group_df[group_df['group'] == group]['prompt'].sample(10)

('Science', 'low')


3017     solve for x: 100x = 104\/2 + 30. Think through your answer step by step and logically
10618                                                          What is 1+6\/3-8*2+4-2\/2+4+8-2
10223                                                                      What is a neutrino?
9415                                                     when were electrons discovered?  How?
12433                                                                            what is RRSO?
4021                     Which is the next number in this sequence? 21, 28, 29, 35, 37, 42 ...
8112                                                 What is the average African American IQ? 
164                              Solve the following x4+2x3−4x2+x=0 using step by step solving
4680                                                  What is the world's most venomous snake?
43               Explain the Hamilton dynamics of a simple Harmonic oscillator in gory detail.
Name: prompt, dtype: object

In [273]:
group_df.rename(columns={'topic': 'high_domain'}, inplace=True)
group_df.drop(columns=['high_domain', 'mid_domain', 'low_domain'], inplace=True)
group_df.to_csv('./lmsys-14x100-grouped.csv', index=False)