## HelpSteer2 Analysis


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import matplotlib.ticker as mtick


# Named font-size tiers referenced by the rcParams below.
FONT_SIZES = dict(small=14, medium=18, large=24)

# Matplotlib rcParams applied once to every figure in this notebook.
# NOTE(review): "text.usetex" presumably needs a working LaTeX toolchain on
# the machine that renders the figures — confirm before running elsewhere.
PLOT_PARAMS = {
    "font.family": "serif",
    "font.serif": ["Times"],
    "font.size": FONT_SIZES["medium"],
    "axes.titlesize": FONT_SIZES["large"],
    "axes.labelsize": FONT_SIZES["large"],
    "xtick.labelsize": FONT_SIZES["large"],
    "ytick.labelsize": FONT_SIZES["large"],
    "legend.fontsize": FONT_SIZES["medium"],
    "figure.titlesize": FONT_SIZES["medium"],
    "text.usetex": True,
}

# Hex colours shared by the plots below (looked up by name).
COLORS = dict(
    pink="#f0529c",
    dark_teal="#0a3235",
    teal="#105257",
    purple="#b11be8",
    green="#0fcb8c",
)


plt.rcParams.update(PLOT_PARAMS)

Let's load the optimal subset for HelpSteer2 and join it with the per-instance features.


In [None]:
# Read the two JSON-lines files, then join them on their shared "id" column
# (pd.merge defaults to an inner join, like DataFrame.merge).
df_swaps, df_feats = (
    pd.read_json(path, lines=True)
    for path in ("hs2_optimal.jsonl", "features.jsonl")
)
df = pd.merge(left=df_swaps, right=df_feats, on="id")

Next, split the data into the instances routed to GPT-4 and those routed to humans.


In [None]:
# Rows with is_swapped == False: kept as the GPT-4-routed subset. The chosen /
# rejected ratings are the first / second element of "rating_gpt4".
hs2_gpt4 = (
    df.loc[~df["is_swapped"]]
    .reset_index(drop=True)
    .assign(
        rating_chosen=lambda frame: frame["rating_gpt4"].apply(lambda pair: pair[0]),
        rating_rejected=lambda frame: frame["rating_gpt4"].apply(lambda pair: pair[1]),
    )
)

# Rows with is_swapped == True: the human-routed subset, with ratings taken
# from "rating_human" in the same positional way.
hs2_hums = (
    df.loc[df["is_swapped"]]
    .reset_index(drop=True)
    .assign(
        rating_chosen=lambda frame: frame["rating_human"].apply(lambda pair: pair[0]),
        rating_rejected=lambda frame: frame["rating_human"].apply(lambda pair: pair[1]),
    )
)

Some important questions:

- Using our top ten features, do we find large differences in counts or distributions?
- Can we find particular examples with high gain and low gain?


In [None]:
def get_top_n_expertise(df, n=10, normalize: bool = True):
    """Return the ``n`` most frequent subjects of expertise in ``df``.

    Args:
        df: Frame with a ``subject_of_expertise`` column whose cells are
            iterables of subject names.
        n: Number of top rows to return.
        normalize: When true, add a ``normalize`` column holding each
            subject's share (in percent) of all subject mentions.

    Returns:
        A DataFrame indexed by subject, sorted by descending ``count``,
        truncated to the first ``n`` rows.

    Note:
        The printed total is the sum of the per-subject counts, i.e. the
        number of subject mentions across all rows.
    """
    tally = Counter()
    for subjects in df.subject_of_expertise.to_list():
        for subject in subjects:
            tally[subject] += 1

    # One-row frame (subjects as columns) transposed into one row per subject.
    counts = pd.DataFrame([tally]).T
    counts = counts.sort_values(by=0, ascending=False)
    counts = counts.rename(columns={0: "count"})

    total = counts["count"].sum()
    print(f"Total num of instances: {total}")
    if normalize:
        counts["normalize"] = (counts["count"] / total) * 100

    return counts.head(n)

In [None]:
# Side-by-side horizontal bar charts of the top subjects of expertise, one
# panel per routing destination. Bars show the "normalize" column returned by
# get_top_n_expertise (percent of all subject mentions in that subset).

# Long subject names shortened for the tick labels; applied to both panels so
# the same subject is labelled identically wherever it appears.
SOE_SHORT_LABELS = {"Journalism, media studies and communication": "Journalism"}

fig, axs = plt.subplots(1, 2, figsize=(17, 5))
for ax, routed_df, panel_title in zip(
    axs,
    (hs2_gpt4, hs2_hums),
    ("Routed to GPT-4", "Routed to Humans"),
):
    # Reverse the descending order so the most frequent subject is drawn last.
    top_subjects = get_top_n_expertise(routed_df).iloc[::-1]
    top_subjects.rename(SOE_SHORT_LABELS).normalize.plot.barh(
        ax=ax, color=COLORS.get("teal")
    )
    ax.set_title(panel_title)

    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)
    # Raw string: "\%" is not a valid Python escape sequence, but the
    # backslash must reach LaTeX because text.usetex is enabled.
    ax.set_xlabel(r"\% of instances")
    ax.xaxis.set_major_formatter(mtick.PercentFormatter())

plt.tight_layout()
fig.savefig("hs2_soe.pdf", bbox_inches="tight")

In [None]:
def get_multipref_soe_rankings(df):
    """Count subjects of expertise in ``df`` for two fixed subject lists.

    Args:
        df: Frame with a ``subject_of_expertise`` column whose cells are
            iterables of subject names.

    Returns:
        A pair ``(human_df, gpt4_df)``. Each frame has ``subject`` and
        ``count`` columns, keeps the descending-count order of the overall
        tally, and contains only the subjects in the corresponding hard-coded
        list below.
    """
    # NOTE(review): these lists presumably come from a MultiPref analysis
    # (per the variable names) — confirm their source.
    multipref_human_soe = [
        "Chemical engineering",
        "Religion",
        "Anthropology",
        "Chemistry",
        "Visual arts",
        "Earth sciences",
        "Space sciences",
    ]
    multipref_gpt4_soe = [
        "Logic",
        "Transportation",
        "Architecture and design",
        "Materials science and engineering",
        "Library and museum studies",
        "Media studies and communication",
        "Military sciences",
        "Family and consumer science",
    ]

    tally = Counter()
    for subjects in df.subject_of_expertise.to_list():
        for subject in subjects:
            tally[subject] += 1

    # One row per subject, most frequent first, with the subject as a column.
    df_soe = pd.DataFrame([tally]).T.sort_values(by=0, ascending=False)
    df_soe = df_soe.reset_index().rename(columns={"index": "subject", 0: "count"})

    in_human_list = df_soe["subject"].isin(multipref_human_soe)
    in_gpt4_list = df_soe["subject"].isin(multipref_gpt4_soe)
    return (
        df_soe[in_human_list].reset_index(drop=True),
        df_soe[in_gpt4_list].reset_index(drop=True),
    )

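Create a grouped bar chart comparing the expertise-level distribution of the instances routed to GPT-4 with that of the instances routed to humans.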


In [None]:
# Share of each expertise level within each routed subset.
# NOTE(review): the "proportion" column name used below is what
# value_counts(normalize=True).reset_index() yields on recent pandas — confirm
# the pinned pandas version.
hs2_gpt4_elevel = (
    hs2_gpt4["expertise_level"]
    .value_counts(normalize=True)
    .loc[["expert domain knowledge", "general public", "basic domain knowledge"]]
    .reset_index()
    .assign(routed_to="gpt4")
)
hs2_hums_elevel = (
    hs2_hums["expertise_level"]
    .value_counts(normalize=True)
    .reset_index()
    .assign(routed_to="human")
)

# Stack both subsets, convert shares to percentages, and use display names
# for the routing destinations.
hs2_elevel = (
    pd.concat([hs2_gpt4_elevel, hs2_hums_elevel], ignore_index=True)
    .assign(percentage=lambda frame: frame["proportion"] * 100)
    .replace({"gpt4": "GPT-4", "human": "Human"})
)

# One row per expertise level (in a fixed order), one column per destination.
order = ["general public", "basic domain knowledge", "expert domain knowledge"]
pivot_df = hs2_elevel.pivot(
    index="expertise_level", columns="routed_to", values="percentage"
).reindex(order)
pivot_df.index = [level.title() for level in order]

# Grouped bar chart: one group per expertise level, one bar per destination.
fig, ax = plt.subplots(figsize=(7.5, 4))
pivot_df.plot.bar(ax=ax, color=[COLORS["teal"], COLORS["pink"]])

for side in ("right", "top"):
    ax.spines[side].set_visible(False)

# The y axis carries no label; its ticks are formatted as percentages.
ax.set_ylabel("")
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.legend(frameon=False, title="Routed to...")

# Two-line tick labels, kept horizontal.
ax.set_xticklabels(
    ["General\npublic", "Basic domain\nknowledge", "Expert domain\nknowledge"],
    rotation=0,
)

plt.tight_layout()
fig.savefig("hs2_loe.pdf", bbox_inches="tight")

In [None]:
# Relative frequency of each open_endedness value among GPT-4-routed instances.
hs2_gpt4["open_endedness"].value_counts(normalize=True)

In [None]:
from sklearn.metrics import cohen_kappa_score

# Human vs. GPT-4 preference labels on the instances routed to humans.
human_annotations = hs2_hums["pref_human"].to_list()
gpt4_annotations = hs2_hums["pref_gpt4"].to_list()

# Raw agreement: fraction of rows where both preference labels are equal.
pct_agreement = int((hs2_hums["pref_human"] == hs2_hums["pref_gpt4"]).sum()) / len(
    hs2_hums
)
# Cohen's kappa for the same pair of label lists.
score = cohen_kappa_score(human_annotations, gpt4_annotations)

print(pct_agreement, score)

In [None]:
# Histogram of entity_sim for the GPT-4-routed instances.
hs2_gpt4["entity_sim"].hist()

In [None]:
# Histogram of entity_sim for the human-routed instances.
hs2_hums["entity_sim"].hist()

In [None]:
# List the columns available on the GPT-4-routed frame.
hs2_gpt4.columns

In [None]:
# Count of each complexity_of_intents value among human-routed instances.
hs2_hums["complexity_of_intents"].value_counts()

In [None]:
# Overlaid count histograms of one similarity feature for the two subsets.
fig, ax = plt.subplots(figsize=(5, 5))
dim = "cosine_sim"
bins = 30
maxv = 400  # upper end of the dashed vertical guide lines

# Human subset is drawn first, then GPT-4, each in its own colour.
for routed_df, routed_label, routed_color in (
    (hs2_hums, "Human", COLORS.get("pink")),
    (hs2_gpt4, "GPT-4", COLORS.get("teal")),
):
    routed_df[dim].hist(
        ax=ax,
        bins=bins,
        alpha=0.5,
        density=False,
        grid=False,
        label=routed_label,
        edgecolor=routed_color,
        color=routed_color,
    )

# Dashed guides at 0.33 and 0.67, which are also used as x ticks.
for boundary in (0.33, 0.67):
    ax.vlines(boundary, 0, maxv, color="k", linestyles="--")
ax.set_xlim([0, 1])
ax.set_xticks([0, 0.33, 0.67, 1.0])

for side in ("right", "top"):
    ax.spines[side].set_visible(False)

ax.set_title("ROUGE-L" if dim == "rouge" else "Cosine similarity")
ax.legend(title="Routed to:", frameon=False, ncols=2, bbox_to_anchor=(1.05, -0.10))
ax.set_ylabel("Counts")
plt.tight_layout()
# hs2_hums[dim].plot.kde(alpha=0.5)
# hs2_gpt4[dim].plot.kde(alpha=0.5)

In [None]:
# Count of each bertscore_length value among human-routed instances.
hs2_hums["bertscore_length"].value_counts()

In [None]:
# Overlaid prompt_len histograms: human-routed first, then GPT-4-routed.
dim = "prompt_len"
hist_kwargs = dict(alpha=0.5, bins=30)
hs2_hums[dim].hist(**hist_kwargs)
hs2_gpt4[dim].hist(**hist_kwargs)

In [None]:
# Print the normalized expertise_level distribution of each subset as a
# markdown table, preceded by the subset's name.
dim = "expertise_level"
for routed_name, routed_df in (("human", hs2_hums), ("gpt4", hs2_gpt4)):
    print(routed_name)
    print(routed_df[dim].value_counts(normalize=True).to_markdown())

In [None]:
# GPT-4-routed instances whose subject_of_expertise contains "Computer sciences".
cs = hs2_gpt4[
    ["Computer sciences" in subjects for subjects in hs2_gpt4["subject_of_expertise"]]
]

In [None]:
# Count of each expertise_level value among GPT-4-routed instances.
hs2_gpt4["expertise_level"].value_counts()

In [None]:
# Tally subjects of expertise over the GPT-4-routed instances whose
# expertise_level is "basic domain knowledge".
Counter(
    subject
    for subjects in hs2_gpt4.loc[
        hs2_gpt4["expertise_level"] == "basic domain knowledge",
        "subject_of_expertise",
    ]
    for subject in subjects
)

In [None]:
# NOTE(review): hard-coded ratio; both numbers were presumably copied from
# earlier cell outputs — confirm their source and derive them from the data.
1504 / 2706

In [None]:
# Show up to 10 prompts from the Computer-sciences subset labelled
# "basic domain knowledge". The sample size is capped at the number of
# matching rows (sample() raises when asked for more rows than exist), and
# random_state is fixed so a re-run shows the same prompts.
cs_basic_prompts = cs.loc[cs["expertise_level"] == "basic domain knowledge", "prompt"]
cs_basic_prompts.sample(n=min(10, len(cs_basic_prompts)), random_state=42).to_list()

In [None]:
# Show up to 10 prompts from the Computer-sciences subset labelled
# "general public". The sample size is capped at the number of matching rows
# (sample() raises when asked for more rows than exist), and random_state is
# fixed so a re-run shows the same prompts.
cs_general_prompts = cs.loc[cs["expertise_level"] == "general public", "prompt"]
cs_general_prompts.sample(n=min(10, len(cs_general_prompts)), random_state=42).to_list()

In [None]:
# Write the GPT-4-routed instances where the human and GPT-4 preferences
# differ to a CSV file (row index omitted).
(
    hs2_gpt4.loc[hs2_gpt4["pref_human"].ne(hs2_gpt4["pref_gpt4"])]
    .reset_index(drop=True)
    .to_csv("hs2_routed_to_gpt4_disagree.csv", index=False)
)

In [None]:
# Human and GPT-4 preference columns for the GPT-4-routed instances.
humpref = hs2_gpt4["pref_human"]
gptpref = hs2_gpt4["pref_gpt4"]

In [None]:
# Number of GPT-4-routed instances where both preference labels are equal.
sum(humpref == gptpref)

In [None]:
# Fraction of GPT-4-routed instances where the human and GPT-4 preferences
# agree. The numerator is computed from the data rather than hard-coded so
# the ratio stays correct when the underlying files change.
int((humpref == gptpref).sum()) / len(humpref)

In [None]:
# Cohen's kappa between human and GPT-4 preferences on the GPT-4-routed
# instances. Relies on cohen_kappa_score having been imported in an earlier cell.
cohen_kappa_score(humpref.to_list(), gptpref.to_list())