In [104]:
import json
import pandas as pd
import numpy as np

# 0. Download openhermes2_5.json from huggingface: https://huggingface.co/datasets/teknium/OpenHermes-2.5

# Load the raw OpenHermes dump and wrap it in a DataFrame. Later code reads the
# "conversations" and "category" columns from it.
# The encoding is pinned to UTF-8 so the read does not depend on the platform's
# default text encoding.
with open("openhermes2_5.json", "r", encoding="utf-8") as f:
    openhermes = json.load(f)
df = pd.DataFrame(openhermes)


def conversation_to_message_history(conversation):
    """Convert a list of ``{"from", "value"}`` turns into role/content messages.

    Args:
        conversation: Iterable of dicts. Each dict must have a ``"from"`` key
            (``"human"``, ``"gpt"`` or ``"system"``) and a ``"value"`` key
            holding the message text.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts in the original
        order, where ``"human"`` maps to ``"user"``, ``"gpt"`` maps to
        ``"assistant"`` and ``"system"`` stays ``"system"``.

    Raises:
        ValueError: If a turn's ``"from"`` value is not one of the three
            recognised senders.
    """
    role_by_sender = {"human": "user", "gpt": "assistant", "system": "system"}

    message_history = []
    for turn in conversation:
        sender = turn["from"]
        # Fail loudly on unknown senders; otherwise the role of the previous
        # turn would be reused (or no role would be defined at all).
        if sender not in role_by_sender:
            raise ValueError(f"Unknown conversation sender: {sender!r}")
        message_history.append(
            {"role": role_by_sender[sender], "content": turn["value"]}
        )

    return message_history

# Convert every raw conversation into a list of role/content messages.
df["message_histories"] = df["conversations"].apply(conversation_to_message_history)

# Rows without a category get the sentinel 0.0 so they form a group of their own.
df["category"] = df["category"].fillna(0.0)

# Inverse-frequency weight per category (total rows / rows in that category), so
# rarer categories are weighted more heavily. value_counts tallies all
# categories in a single pass instead of rescanning the column per category.
category_counts = df["category"].value_counts(sort=False)
freqs = (len(df) / category_counts).to_dict()

# These categories get weight 0.0, which excludes them from weighted sampling.
freqs.update(
    dict.fromkeys(
        (
            "experience",
            "stylized_response",
            "joke",
            "trivia",
            "roleplay",
            "riddle",
            "greeting",
        ),
        0.0,
    )
)

# Per-row sampling weight, looked up from the row's category.
df["weights"] = df["category"].map(freqs)

In [105]:
def randomly_select_n_examples(df, n, seed=42):
    """Draw ``n`` rows from ``df`` at random, biased by its ``"weights"`` column.

    Args:
        df: DataFrame that contains a ``"weights"`` column.
        n: Number of rows to draw.
        seed: Value passed as ``random_state`` so the draw is repeatable.

    Returns:
        The DataFrame produced by ``df.sample`` for those arguments.
    """
    row_weights = df["weights"]
    subset = df.sample(n=n, weights=row_weights, random_state=seed)
    return subset

# Draw a fixed-size weighted sample and keep only the converted message lists.
df_small = randomly_select_n_examples(df, n=15500, seed=27)
examples = df_small["message_histories"].tolist()

# Write the sampled conversations out as one JSON array.
with open("openhermes_15500.json", "w") as out_file:
    json.dump(examples, out_file)