In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns

from llm_ol.utils import sized_subplots

In [None]:
# Display label -> location of the precomputed metrics JSON for each graph
# being compared. The labels double as legend entries in the plots below.
paths = {
    "Wikipedia (depth 3)": Path("out/data/wikipedia/v1/eval/metrics.json"),
    "Hearst": Path("out/experiments/hearst/v1/metrics.json"),
    "Prompting": Path("out/experiments/prompting/dev-h-v2/metrics.json"),
}

# Parse every metrics file once; later cells look results up by label.
# The encoding is pinned to UTF-8 so the files decode identically regardless of
# the platform's default locale encoding.
metrics = {k: json.loads(v.read_text(encoding="utf-8")) for k, v in paths.items()}

# Output directory is named after the compared labels, joined with "-".
out_dir = Path(f"out/graphs/{'-'.join(paths.keys())}")
out_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# Eigenspectrum
# Draw one curve per graph: the stored eigenvalues against their 1-based
# position, with the position axis on a log scale.
fig, axs = sized_subplots()
ax = axs[0, 0]
for name, m in metrics.items():
    eigenvalues = np.array(m["eigenspectrum"])
    positions = np.arange(1, eigenvalues.size + 1)
    sns.lineplot(x=positions, y=eigenvalues, label=name, ax=ax)
_ = ax.set(
    title="Eigenspectrum of the undirected, normalised Laplacian",
    xlabel="$i$\\textsuperscript{th} eigenvalue",
    ylabel="Value",
    xscale="log",
)
# fig.savefig(out_dir / "eigenspectrum.pdf")

In [None]:
# Centrality spectrum
# Draw one curve per graph: the centrality scores stored in m["central_nodes"],
# plotted in stored order against their 1-based position on log-log axes.
# NOTE(review): presumably the list is sorted most-central first, as the x-axis
# label implies; confirm against the code that writes metrics.json.
fig, axs = sized_subplots()
for name, m in metrics.items():
    # Each entry is a (node, centrality) pair; only the score is plotted here.
    centrality_spectrum = [centrality for _node, centrality in m["central_nodes"]]
    sns.lineplot(
        x=np.arange(1, len(centrality_spectrum) + 1),
        y=centrality_spectrum,
        label=name,
        ax=axs[0, 0],
    )
_ = axs[0, 0].set(
    title="Centrality of nodes",
    # The backslash must be doubled: a bare "\t" is a TAB escape in a Python
    # string and would corrupt the "\textsuperscript" command.
    xlabel="$i$\\textsuperscript{th} most central node",
    ylabel="Betweenness centrality",
    xscale="log",
    yscale="log",
)
# fig.savefig(out_dir / "centrality_spectrum.pdf")

In [None]:
# In- and out-degree distributions
fig, axs = sized_subplots(2, 2)

# Build one long-form frame with a "Name" column so seaborn can colour by graph.
dfs = [
    pd.DataFrame(
        {"Name": name, "in_degree": m["in_degree"], "out_degree": m["out_degree"]}
    )
    for name, m in metrics.items()
]
# ignore_index=True gives the stacked rows a unique index; without it every
# per-graph frame restarts at 0 and the combined frame has duplicate labels,
# which can trip up plotting with `hue`.
df = pd.concat(dfs, ignore_index=True)

# (axis, column, title, x-axis label) for each panel; both share the same
# histogram settings and a log-scaled frequency axis.
panels = [
    (axs[0, 0], "in_degree", "In-degree distribution", "In-degree"),
    (axs[0, 1], "out_degree", "Out-degree distribution", "Out-degree"),
]
for ax, column, title, xlabel in panels:
    _ = sns.histplot(data=df, x=column, hue="Name", ax=ax, bins=40)
    _ = ax.set(title=title, xlabel=xlabel, ylabel="Frequency", yscale="log")
# fig.savefig(out_dir / "degree_distributions.pdf")

In [None]:
# Component size distribution
fig, axs = sized_subplots(2, 2)

# (axis, metrics key, title) for each panel. The axes are named explicitly so
# each histogram is drawn and labelled on the same axis.
panels = [
    (axs[0, 0], "weakly_connected", "Weakly connected component size distribution"),
    (axs[0, 1], "strongly_connected", "Strongly connected component size distribution"),
]
for ax, key, title in panels:
    dfs = [pd.DataFrame({"Name": name, key: m[key]}) for name, m in metrics.items()]
    # ignore_index=True avoids duplicate index labels in the stacked frame
    # (each per-graph frame would otherwise restart at 0).
    df = pd.concat(dfs, ignore_index=True)
    _ = sns.histplot(data=df, x=key, hue="Name", ax=ax, bins=40)
    _ = ax.set(title=title, xlabel="Size", ylabel="Frequency", yscale="log")
# fig.savefig(out_dir / "component_size_distributions.pdf")

In [None]:
# Distance distribution
# Stack the per-graph distance samples into one long-form frame keyed by "Name".
dfs = [
    pd.DataFrame({"Name": name, "distance": m["distance"]})
    for name, m in metrics.items()
]
# ignore_index=True avoids duplicate index labels in the stacked frame
# (each per-graph frame would otherwise restart at 0).
df = pd.concat(dfs, ignore_index=True)

fig, axs = sized_subplots()
_ = sns.histplot(
    data=df,
    x="distance",
    hue="Name",
    stat="probability",
    discrete=True,
    # Normalise each graph separately so graphs of different sizes are comparable.
    common_norm=False,
    ax=axs[0, 0],
    # NOTE(review): `bins` likely has no effect alongside discrete=True; confirm
    # against the seaborn version in use before removing it.
    bins=40,
)
_ = axs[0, 0].set(
    title="Distance distribution",
    xlabel="Distance",
    ylabel="Probability",
)

In [None]:
# Numerical metrics
# Summary table with one row per graph, built from scalar entries in its metrics.
# "Root" is the node of the first entry in central_nodes, and "lambda_2" is the
# second entry of the stored eigenspectrum.
summary_rows = [
    {
        "Name": name,
        "Root": m["central_nodes"][0][0],
        "Nodes": m["num_nodes"],
        "Edges": m["num_edges"],
        "Density": m["density"],
        "Diameter (est)": m["diameter"],
        "lambda_2": m["eigenspectrum"][1],
    }
    for name, m in metrics.items()
]
df = pd.DataFrame(summary_rows)
df