In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Patch

import os

import config



In [None]:
# Resolve both expert-feedback workbooks inside the processed-data directory.
proc = config.PROC_DATA_PATH
wclusterpath = os.path.join(proc, "ExpertFeedback_WithinCluster.xlsx")
bwclusterpath = os.path.join(proc, "ExpertFeedback_BetweenCluster.xlsx")

# The within-cluster table is read from the workbook's default sheet; the
# between-cluster workbook is read sheet by sheet, starting with "KL0".
wcluster = pd.read_excel(wclusterpath)
clusterkl0 = pd.read_excel(bwclusterpath, sheet_name="KL0")

In [None]:
# Remaining between-cluster sheets ("KL1" to "KL4"), one frame per sheet.
clusterkl1, clusterkl2, clusterkl3, clusterkl4 = (
    pd.read_excel(bwclusterpath, sheet_name=f"KL{level}")
    for level in range(1, 5)
)

In [None]:
# Between-cluster frames collected in sheet order, so position i holds sheet "KL{i}".
clusterkl = [
    clusterkl0,
    clusterkl1,
    clusterkl2,
    clusterkl3,
    clusterkl4,
]

In [None]:
# Give the first column a meaningful name (in place). "Unnamed: 0" is the label
# pandas assigns to a column without a header; presumably this column holds the
# KL score of each row -- TODO confirm against the workbook.
wcluster.rename(columns={"Unnamed: 0": "KL-Score"}, inplace=True)

In [None]:
sns.set_theme(style="whitegrid", font_scale=1.2)

# Annotated heatmap of the within-cluster table, indexed by KL-Score.
fig, ax = plt.subplots(figsize=(8, 6))
heatmap = sns.heatmap(
    wcluster.set_index("KL-Score"),
    annot=True,
    cmap="flare",
    cbar_kws={"label": "Similarity Rating"},
    ax=ax,
)
heatmap.set_title("Expert Feedback Within Clusters")
fig.tight_layout()
plt.show()

In [None]:
# Wide -> long: one row per (KL-Score, cluster) pair with its rating.
wcluster_unpivot = pd.melt(
    wcluster,
    id_vars=["KL-Score"],
    var_name="cluster",
    value_name="rating",
)

In [None]:
# Keep only complete rows whose rating is not the -1 marker.
wcluster_unpivot = (
    wcluster_unpivot
    .dropna()
    .loc[lambda frame: frame["rating"] != -1]
)

In [None]:
# Distribution of the within-cluster ratings, one box per cluster.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(
    data=wcluster_unpivot,
    x="cluster",
    y="rating",
    # seaborn >= 0.13 deprecates `palette` without `hue`; assigning the x
    # variable to `hue` and disabling the legend gives the same colouring.
    hue="cluster",
    palette="Set2",
    legend=False,
    ax=ax,
)
ax.set_xlabel("Cluster")
ax.set_ylabel("Rating")
ax.set_title("Within-cluster ratings by cluster")
fig.tight_layout()
plt.show()

In [None]:
# Distribution of the within-cluster ratings, one box per KL-Score value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(
    data=wcluster_unpivot,
    x="KL-Score",
    y="rating",
    # seaborn >= 0.13 deprecates `palette` without `hue`; assigning the x
    # variable to `hue` and disabling the legend gives the same colouring.
    hue="KL-Score",
    palette="Set2",
    legend=False,
    medianprops=dict(color="black", linewidth=2),
    ax=ax,
)
ax.set_xlabel("KL-Score")
ax.set_ylabel("Rating")
ax.set_title("Within-cluster ratings by KL-Score")
fig.tight_layout()
plt.show()

In [None]:
# Label the first column of every between-cluster sheet "Cluster".
# The rename stays in place so all existing references to these frames see it.
for sheet_frame in (clusterkl0, clusterkl1, clusterkl2, clusterkl3, clusterkl4):
    sheet_frame.rename(columns={"Unnamed: 0": "Cluster"}, inplace=True)

In [None]:
kl = [0, 1, 2, 3, 4]

# One annotated heatmap per KL score on a 3x2 grid.
fig, axes = plt.subplots(3, 2, figsize=(15, 15), squeeze=False)
flat_axes = axes.ravel()

for k, ax in zip(kl, flat_axes):
    # `clusterkl` lists the per-KL frames in sheet order, so index it directly
    # instead of resolving the variable name through eval().
    cluster_df = clusterkl[k]
    heatmap = sns.heatmap(
        cluster_df.set_index("Cluster"),
        annot=True,
        cmap="flare",
        cbar=False,  # cells are annotated; skip the per-panel colorbar instead of drawing and removing it
        ax=ax,
    )
    heatmap.set_title(f"Expert Feedback Between Clusters for KL-{k}")
    heatmap.set_ylabel("Cluster")
    heatmap.set_xlabel("Cluster")

# The grid has more cells than KL scores; hide the panels that stay empty.
for unused_ax in flat_axes[len(kl):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:

clusters = ["Cluster 0", "Cluster 1", "Cluster 2", "Cluster 3"]

# One "flare" colour per cluster label, keyed by the label.
flare_colors = sns.color_palette("flare", n_colors=len(clusters))
palette = {name: color for name, color in zip(clusters, flare_colors)}

In [None]:
# Reshape every between-cluster matrix into long format:
# one row per (cluster_a, cluster_b) cell with its rating and KL score.
dfs_long = []

for k in kl:
    df_long = (
        clusterkl[k]                 # per-KL frame by position, no eval() needed
        .set_index("Cluster")
        # Name both axes up front so the stacked columns get explicit names
        # rather than the positional "level_1" / 0 labels.
        .rename_axis(index="cluster_a", columns="cluster_b")
        .stack()
        .rename("rating")
        .reset_index()
        .dropna()
        .assign(KL=k)
    )
    dfs_long.append(df_long)

long_df = pd.concat(dfs_long, ignore_index=True)


In [None]:
# Mirrored copy of long_df: cluster_a and cluster_b trade places.
# `cluster_b2` temporarily holds the original cluster_b values.
long_df_copy = long_df.assign(
    cluster_b2=long_df["cluster_b"],
    cluster_b=long_df["cluster_a"],
)
long_df_copy["cluster_a"] = long_df_copy["cluster_b2"]

In [None]:
# The helper column is no longer needed once the swap is done.
long_df_copy = long_df_copy.drop(columns="cluster_b2")

In [None]:
# Append the mirrored rows so every pair is listed in both directions; the
# shapes are printed before and after as a sanity check.
# NOTE: `long_df` is rebound here, so re-running this cell appends the mirrored
# rows again.
print(long_df.shape, long_df_copy.shape)
long_df = pd.concat([long_df, long_df_copy], ignore_index=True)
print(long_df.shape)

In [None]:
# Show the column labels of the combined long-format frame.
long_df.columns

In [None]:
# Drop rows that repeat the same (KL, cluster_a, cluster_b, rating) combination,
# keeping the first occurrence of each.
dedup_keys = ["KL", "cluster_a", "cluster_b", "rating"]
long_df = long_df.drop_duplicates(subset=dedup_keys)

In [None]:
# Row/column count after de-duplication.
long_df.shape

In [None]:
# Grouped bar chart: one panel per cluster_a, one bar group per cluster_b,
# one bar colour per KL score.
kl_values = sorted(long_df["KL"].unique())
kl_palette = dict(
    zip(kl_values, sns.color_palette("Set3", n_colors=len(kl_values)))
)

# Global category order so every panel lists cluster_b at the same positions.
cluster_b_order = sorted(long_df["cluster_b"].unique())
clusters = sorted(long_df["cluster_a"].unique())

# NOTE(review): other cells of this notebook count rating == -1 as
# "not rateable". Exclude that marker here so it is not averaged into the bars.
bar_df = long_df[long_df["rating"] != -1]

# Global x-axis limits taken from the ratings that are actually plotted.
xmin = bar_df["rating"].min()
xmax = bar_df["rating"].max()

# squeeze=False keeps `axes` an array even when there is a single cluster.
fig, axes = plt.subplots(
    1, len(clusters),
    figsize=(4 * len(clusters), 6),
    sharey=True,
    sharex=True,  # identical x-axis in every panel
    squeeze=False,
)

for ax, cluster in zip(axes.flatten(), clusters):
    subset = bar_df[bar_df["cluster_a"] == cluster]

    sns.barplot(
        data=subset,
        x="rating",
        y="cluster_b",
        hue="KL",
        palette=kl_palette,
        order=cluster_b_order,
        ax=ax,
        legend=False,
    )

    ax.set_title(cluster)
    ax.set_xlabel("Similarity rating")
    ax.set_ylabel("")
    if pd.notna(xmin) and pd.notna(xmax):
        # Bars start at zero, so keep zero inside the axis; otherwise the bar
        # for the smallest rating would have no visible length.
        ax.set_xlim(min(0, xmin), xmax)

    # Draw separators between the cluster_b categories.
    n_cats = len(cluster_b_order)
    for y in range(1, n_cats):
        ax.axhline(y=y - 0.5, color="lightgray", lw=1, zorder=0)

# One figure-level legend for the KL colours.
handles = [
    Patch(facecolor=kl_palette[k], label=f"KL {k}")
    for k in kl_values
]

fig.legend(
    handles=handles,
    title="KL score",
    loc="upper center",
    ncol=len(kl_values),
    frameon=False,
)

# Leave the top 10% of the figure free for the legend.
plt.tight_layout(rect=[0, 0, 1, 0.9])
plt.show()

In [None]:
# Display the symmetrised, de-duplicated long-format frame.
long_df

In [None]:
# Point plot of the between-cluster ratings: one panel per cluster_a,
# one point per (cluster_b, KL) combination.

# NOTE(review): other cells of this notebook count rating == -1 as
# "not rateable". Exclude that marker so it is not plotted as a rating.
point_df = long_df[long_df["rating"] != -1]

# Fixed category order so the shared y-axis means the same thing in every panel.
point_order = sorted(long_df["cluster_b"].unique())

fig, axes = plt.subplots(
    1, len(clusters),
    figsize=(4 * len(clusters), 4),
    sharey=True,
    squeeze=False,
)
axes = axes.ravel()  # 1-D array of panels, also when there is a single cluster

for ax, cluster in zip(axes, clusters):
    subset = point_df[point_df["cluster_a"] == cluster]

    sns.pointplot(
        data=subset,
        x="rating",
        y="cluster_b",
        hue="KL",
        palette=kl_palette,
        order=point_order,
        dodge=True,
        linestyle="none",  # replaces the deprecated join=False
        ax=ax,
        legend=False,
    )

    ax.set_title(cluster)
    ax.set_xlabel("Similarity rating")
    ax.set_ylabel("")

plt.tight_layout()
plt.show()


In [None]:
# One cluster_a x cluster_b heatmap per KL score, built from the long frame.

# NOTE(review): other cells of this notebook count rating == -1 as
# "not rateable". Exclude that marker so it does not stretch the colour scale.
heat_df = long_df[long_df["rating"] != -1]

# Only the last panel shows a colorbar, so all panels must share one scale.
vmin = heat_df["rating"].min()
vmax = heat_df["rating"].max()

# Create a dedicated figure with one panel per KL score instead of drawing on
# axes left over from an earlier cell.
fig, axes = plt.subplots(
    1, len(kl),
    figsize=(5 * len(kl), 4.5),
    squeeze=False,
)
axes = axes.ravel()

for ax, k in zip(axes, kl):
    df = heat_df[heat_df["KL"] == k]

    # pivot_table averages repeated (cluster_a, cluster_b) pairs, where
    # DataFrame.pivot would raise on duplicates.
    pivot = df.pivot_table(
        index="cluster_a",
        columns="cluster_b",
        values="rating",
        aggfunc="mean",
    )

    ax.set_title(f"KL {k}")
    if pivot.empty:
        # Nothing rateable for this KL score: leave the panel blank.
        ax.set_axis_off()
        continue

    sns.heatmap(
        pivot,
        cmap="viridis",
        annot=True,
        vmin=vmin,
        vmax=vmax,
        ax=ax,
        cbar=(ax is axes[-1]),
    )

plt.tight_layout()
plt.show()

In [None]:
def summarize(series):
    """Return descriptive statistics for the numeric values in ``series``.

    Values are converted with ``pd.to_numeric(errors="coerce")`` and missing
    results are dropped before anything is computed.

    Returns:
        dict with the keys ``n`` (int count of numeric values) and ``mean``,
        ``median``, ``min``, ``max`` (floats; NaN when no numeric value is left).
    """
    values = pd.to_numeric(series, errors="coerce").dropna()
    stats = {"n": int(values.count())}
    for stat_name in ("mean", "median", "min", "max"):
        # Each statistic is the Series method of the same name.
        stats[stat_name] = (
            float(getattr(values, stat_name)()) if len(values) else np.nan
        )
    return stats

def clean_ratings(x):
    """Coerce ``x`` to a numeric value.

    Anything that cannot be parsed becomes NaN; numeric input, including the
    -1 marker, is returned unchanged so it can still be counted afterwards.
    """
    numeric = pd.to_numeric(x, errors="coerce")
    return numeric



In [None]:
# `within_df` is a second name for the same object as `wcluster` (no copy is
# made); preview its first rows.
within_df = wcluster
within_df.head()

In [None]:
# Numeric rating matrix indexed by KL-Score. Setting the index first keeps the
# KL-Score column out of the ratings, and applying `clean_ratings` per column
# replaces the deprecated element-wise DataFrame.applymap.
within_mat = within_df.set_index("KL-Score").apply(clean_ratings)

# Long format: one row per (kl_group, cluster) cell.
within_long = (
    within_mat.rename_axis(index="kl_group")
    .reset_index()
    .melt(id_vars="kl_group", var_name="cluster", value_name="rating")
)

# Flag every cell as missing (NaN), not rateable (-1) or rateable (anything else).
within_long["is_missing"] = within_long["rating"].isna()
within_long["is_not_rateable"] = within_long["rating"].eq(-1)
within_long["is_rateable"] = (~within_long["is_missing"]) & (~within_long["is_not_rateable"])

within_rateable = within_long.loc[within_long["is_rateable"], "rating"]


In [None]:
# Remove rows whose "cluster" is really the KL-Score column. Take an explicit
# copy so later column assignments work on an independent frame instead of a
# filtered view (avoids SettingWithCopyWarning).
within_long = within_long[within_long["cluster"] != "KL-Score"].copy()

In [None]:
# Ensure the rating column is numeric. `assign` returns a new frame, so this is
# safe even when `within_long` is a filtered slice (no SettingWithCopyWarning).
within_long = within_long.assign(
    rating=pd.to_numeric(within_long["rating"], errors="coerce")
)


In [None]:
# Classify every within-cluster cell, then report counts and overall statistics.
within_missing = within_long["rating"].isna()
within_not_rateable = within_long["rating"].eq(-1)
# Rateable = neither missing nor marked -1.
within_rateable = within_long.loc[
    ~(within_missing | within_not_rateable), "rating"
]

n_missing = int(within_missing.sum())
n_not_rateable = int(within_not_rateable.sum())
n_rateable = int(len(within_rateable))

print("\n=== WITHIN-CLUSTER (intra-cluster) ===")
print("Total cells:", len(within_long))
print("Missing (NaN):", n_missing)
print("Not rateable (-1):", n_not_rateable)
print("Rateable:", n_rateable)
print("Summary:", summarize(within_rateable))

In [None]:
# Per-cluster statistics over the rateable cells (not NaN, not -1).
rateable_mask = within_long["rating"].notna() & within_long["rating"].ne(-1)

within_by_cluster = (
    within_long[rateable_mask]
    .groupby("cluster")
    .agg(
        count=("rating", "count"),
        mean=("rating", "mean"),
        median=("rating", "median"),
        min=("rating", "min"),
        max=("rating", "max"),
    )
    .reset_index()
)

print("\nWithin-cluster summary by CLUSTER")
print(within_by_cluster.to_string(index=False))

In [None]:
# Display the long-format within-cluster frame.
within_long

In [None]:
# Plot input: only cells with a real rating (not NaN, not the -1 marker).
within_plot = within_long.loc[
    lambda frame: frame["rating"].notna() & frame["rating"].ne(-1)
]


In [None]:
# Box plot of the within-cluster ratings per cluster with the mean overlaid.
fig, ax = plt.subplots(figsize=(8, 5))

# Means in order of first appearance (sort=False). The same order is passed to
# the box plot so that marker i is guaranteed to sit on box i.
means = within_plot.groupby("cluster", sort=False)["rating"].mean()
cluster_order = list(means.index)

sns.boxplot(
    data=within_plot,
    x="cluster",
    y="rating",
    # seaborn >= 0.13 deprecates `palette` without `hue`; assigning the x
    # variable to `hue` and disabling the legend gives the same colouring.
    hue="cluster",
    order=cluster_order,
    hue_order=cluster_order,
    showfliers=True,
    palette="Set2",
    legend=False,
    ax=ax,
)

# Overlay the mean; categorical positions are 0..n-1 in `cluster_order`.
ax.scatter(
    x=range(len(cluster_order)),
    y=means.values,
    marker="D",
    s=60,
    label="Mean",
    zorder=10,
)

ax.set_ylabel("Expert similarity rating")
ax.set_xlabel("Cluster")
ax.set_title("Within-cluster similarity")

ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Per-kl_group statistics over the rateable cells (not NaN, not -1).
rateable_mask = within_long["rating"].notna() & within_long["rating"].ne(-1)

within_by_kl = (
    within_long[rateable_mask]
    .groupby("kl_group")
    .agg(
        count=("rating", "count"),
        mean=("rating", "mean"),
        median=("rating", "median"),
        min=("rating", "min"),
        max=("rating", "max"),
    )
    .reset_index()
)

print("\nWithin-cluster summary by KL GROUP")
print(within_by_kl.to_string(index=False))


In [None]:
# Bar chart of the within-cluster ratings per cluster, split by kl_group.
fig, ax = plt.subplots(figsize=(9, 5))

sns.barplot(
    data=within_plot,
    x="cluster",
    y="rating",
    hue="kl_group",
    palette="Set2",
    ax=ax,
)

ax.set_xlabel("Cluster")
ax.set_ylabel("Expert similarity rating")
ax.set_title("Within-cluster similarity stratified by KL score")

# Legend anchored just outside the right edge of the axes.
ax.legend(
    title="KL score",
    bbox_to_anchor=(1.02, 1),
    loc="upper left",
    borderaxespad=0,
)

fig.tight_layout()
plt.show()


In [None]:
# Flatten the between-cluster matrices into one row per off-diagonal,
# non-missing cell.
between_rows = []

# `clusterkl` lists the per-KL frames in sheet order, so the position doubles as
# the sheet number (no eval() lookup of variable names needed).
for sheet, sheet_df in enumerate(clusterkl):

    # The "Cluster" column holds the row labels of the matrix.
    mat = sheet_df.set_index("Cluster")

    for r, row in mat.iterrows():
        for c, val in row.items():

            if pd.isna(val):
                continue

            # Skip the diagonal; compare as stripped strings so differently
            # typed or padded labels of the same cluster still match.
            if str(r).strip() == str(c).strip():
                continue

            between_rows.append({
                "kl_sheet": sheet,
                "cluster_a": r,
                "cluster_b": c,
                "rating": val
            })

# Explicit columns keep the schema stable even when no row was collected.
between_long = pd.DataFrame(
    between_rows,
    columns=["kl_sheet", "cluster_a", "cluster_b", "rating"],
)


# # between_long = pd.DataFrame(between_rows)
# between_long["is_not_rateable"] = between_long["rating"].eq(-1)
# between_long["is_rateable"] = ~between_long["is_not_rateable"]
# between_rateable = between_long.loc[between_long["is_rateable"], "rating"]

# # Make a canonical (unordered) cluster pair label for grouping
# def canonical_pair(a, b):
#     a, b = str(a).strip(), str(b).strip()
#     return tuple(sorted([a, b]))

# between_long["pair"] = between_long.apply(lambda r: canonical_pair(r["cluster_a"], r["cluster_b"]), axis=1)


In [None]:
# Mean / SD of the rateable between-cluster ratings per unordered cluster pair,
# arranged as symmetric matrices for the heatmap.
between_plot = between_long[
    between_long["rating"].notna() & (between_long["rating"] != -1)
].copy()

# Canonical pair = (smaller label, larger label). Labels are compared as
# stripped strings, like the diagonal check in the extraction loop, so padded
# or differently typed labels of the same cluster end up in the same pair.
label_a = between_plot["cluster_a"].astype(str).str.strip()
label_b = between_plot["cluster_b"].astype(str).str.strip()
a_first = label_a <= label_b
between_plot["cluster_min"] = label_a.where(a_first, label_b)
between_plot["cluster_max"] = label_b.where(a_first, label_a)

pair_stats = (
    between_plot
    .groupby(["cluster_min", "cluster_max"])["rating"]
    .agg(["mean", "std"])
    .reset_index()
)

clusters = sorted(
    set(pair_stats["cluster_min"]).union(pair_stats["cluster_max"])
)

mean_matrix = pd.DataFrame(index=clusters, columns=clusters, dtype=float)
std_matrix = pd.DataFrame(index=clusters, columns=clusters, dtype=float)
# Cell labels; pairs without data (including the diagonal) stay empty.
annot_matrix = pd.DataFrame("", index=clusters, columns=clusters, dtype=object)

for stat in pair_stats.itertuples(index=False):
    a, b = stat.cluster_min, stat.cluster_max

    # SD is NaN when a pair has a single rating; show the mean alone then.
    if pd.isna(stat.std):
        label = f"{stat.mean:.2f}"
    else:
        label = f"{stat.mean:.2f} ± {stat.std:.2f}"

    # Fill both triangles so the matrices are symmetric.
    for row_label, col_label in ((a, b), (b, a)):
        mean_matrix.loc[row_label, col_label] = stat.mean
        std_matrix.loc[row_label, col_label] = stat.std
        annot_matrix.loc[row_label, col_label] = label


In [None]:
# Heatmap of the mean between-cluster rating per cluster pair; the cell text
# comes from `annot_matrix` (fmt="" because the labels are already strings).
fig, ax = plt.subplots(figsize=(12, 10))

sns.heatmap(
    mean_matrix,
    annot=annot_matrix,
    fmt="",
    cmap="viridis",
    linewidths=0.5,
    square=True,
    cbar_kws={"label": "Mean similarity"},
    ax=ax,
)

# Title uses a proper plus-minus sign instead of the mis-encoded character pair.
ax.set_title("Between-cluster similarity (mean ± SD)")
ax.set_xlabel("Cluster B")
ax.set_ylabel("Cluster A")
fig.tight_layout()
plt.show()

In [None]:
# Box plot of the within-cluster ratings per cluster with the mean overlaid.
fig, ax = plt.subplots(figsize=(8, 5))

# Means in order of first appearance (sort=False). The same order is passed to
# the box plot so that marker i is guaranteed to sit on box i.
means = within_plot.groupby("cluster", sort=False)["rating"].mean()
cluster_order = list(means.index)

sns.boxplot(
    data=within_plot,
    x="cluster",
    y="rating",
    # seaborn >= 0.13 deprecates `palette` without `hue`; assigning the x
    # variable to `hue` and disabling the legend gives the same colouring.
    hue="cluster",
    order=cluster_order,
    hue_order=cluster_order,
    showfliers=True,
    palette="Set2",
    legend=False,
    ax=ax,
)

# Overlay the mean; categorical positions are 0..n-1 in `cluster_order`.
ax.scatter(
    x=range(len(cluster_order)),
    y=means.values,
    marker="D",
    s=60,
    label="Mean",
    zorder=10,
)

ax.set_ylabel("Expert similarity rating")
ax.set_xlabel("Cluster")
ax.set_title("Within-cluster similarity")

ax.legend()
fig.tight_layout()
plt.show()



In [None]:

# Bar chart of the within-cluster ratings per cluster, split by kl_group.
fig, ax = plt.subplots(figsize=(9, 5))

sns.barplot(
    data=within_plot,
    x="cluster",
    y="rating",
    hue="kl_group",
    palette="Set2",
    ax=ax,
)

ax.set_xlabel("Cluster")
ax.set_ylabel("Expert similarity rating")
ax.set_title("Within-cluster similarity stratified by KL score")

# Legend anchored just outside the right edge of the axes.
ax.legend(
    title="KL score",
    bbox_to_anchor=(1.02, 1),
    loc="upper left",
    borderaxespad=0,
)

fig.tight_layout()
plt.show()

In [None]:
# Classify the between-cluster comparisons and report overall statistics.
between_missing = between_long["rating"].isna()
between_not_rateable = between_long["rating"] == -1
# Explicit copy: a "pair" column is added below, which must not write into a
# filtered view of `between_long` (avoids SettingWithCopyWarning).
between_rateable = between_long.loc[
    (~between_missing) & (~between_not_rateable)
].copy()

print("\n=== BETWEEN-CLUSTER (inter-cluster) ===")
print("Total comparisons:", len(between_long))
print("Not rateable (-1):", int(between_not_rateable.sum()))
print("Rateable:", int(len(between_rateable)))
print("Summary:", summarize(between_rateable["rating"]))

# ---- canonical cluster pairs
# Unordered pair label: both orientations of a comparison map to one tuple.
# Labels are compared as stripped strings (like the diagonal check in the
# extraction loop), which also keeps sorted() from failing on mixed types.
# A list comprehension, unlike a row-wise apply, also works on an empty frame.
between_rateable["pair"] = [
    tuple(sorted((str(a).strip(), str(b).strip())))
    for a, b in zip(between_rateable["cluster_a"], between_rateable["cluster_b"])
]


In [None]:
# Statistics computed for every grouping of the rateable between-cluster ratings.
rating_stats = dict(
    count=("rating", "count"),
    mean=("rating", "mean"),
    median=("rating", "median"),
    min=("rating", "min"),
    max=("rating", "max"),
)

# ---- by KL sheet
between_by_kl = between_rateable.groupby("kl_sheet").agg(**rating_stats).reset_index()

print("\nBetween-cluster summary by KL GROUP")
print(between_by_kl.to_string(index=False))

# ---- by cluster pair
between_by_pair = between_rateable.groupby("pair").agg(**rating_stats).reset_index()

print("\nBetween-cluster summary by CLUSTER PAIR")
print(between_by_pair.to_string(index=False))

In [None]:

def _summary_table(frame, by):
    """Return one row per `by` group holding the statistics from `summarize`."""
    rows = [
        {by: key, **summarize(ratings)}
        for key, ratings in frame.groupby(by)["rating"]
    ]
    return pd.DataFrame(rows, columns=[by, "n", "mean", "median", "min", "max"])


# ---- within-cluster flags, derived from the ratings themselves so this cell
# does not depend on helper columns created in other cells.
within_rating = pd.to_numeric(within_long["rating"], errors="coerce")
within_is_missing = within_rating.isna()
within_is_not_rateable = within_rating.eq(-1)
within_ok = within_long.loc[~within_is_missing & ~within_is_not_rateable]

print("\n=== WITHIN-CLUSTER (intra-cluster) ===")
print("Total cells:", len(within_long))
print("Missing (blank/NaN):", int(within_is_missing.sum()))
print("Not rateable (-1):", int(within_is_not_rateable.sum()))
print("Rateable:", int(len(within_ok)))
print("Summary (rateable only):", summarize(within_ok["rating"]))

print("\nWithin-cluster summary by CLUSTER (rateable only):")
within_by_cluster = _summary_table(within_ok, "cluster")
print(within_by_cluster.to_string(index=False))

print("\nWithin-cluster summary by KL GROUP (rateable only):")
within_by_kl = _summary_table(within_ok, "kl_group")
print(within_by_kl.to_string(index=False))

# ---- between-cluster flags and canonical pair labels, again computed locally:
# `between_long` itself only carries kl_sheet, cluster_a, cluster_b and rating.
between_rating = pd.to_numeric(between_long["rating"], errors="coerce")
between_is_not_rateable = between_rating.eq(-1)
between_ok = between_long.loc[
    between_rating.notna() & ~between_is_not_rateable
].copy()
# Unordered pair label, compared as stripped strings so both orientations of a
# comparison share one tuple.
between_ok["pair"] = [
    tuple(sorted((str(a).strip(), str(b).strip())))
    for a, b in zip(between_ok["cluster_a"], between_ok["cluster_b"])
]

print("\n=== BETWEEN-CLUSTER (inter-cluster) ===")
print("Total extracted comparisons:", len(between_long))
print("Not rateable (-1):", int(between_is_not_rateable.sum()))
print("Rateable:", int(len(between_ok)))
# summarize() expects the rating Series, not the whole frame.
print("Summary (rateable only):", summarize(between_ok["rating"]))

print("\nBetween-cluster summary by KL SHEET (rateable only):")
between_by_kl = _summary_table(between_ok, "kl_sheet")
print(between_by_kl.to_string(index=False))

print("\nBetween-cluster summary by CLUSTER PAIR (rateable only):")
between_by_pair = _summary_table(between_ok, "pair")
print(between_by_pair.to_string(index=False))

# -----------------------------
# Optional: save long/tidy data for reporting/plots
# (written to the current working directory)
# -----------------------------
within_long.to_csv("within_long.csv", index=False)
between_long.to_csv("between_long.csv", index=False)
print("\nSaved: within_long.csv, between_long.csv")


In [None]:
# Display the long-format between-cluster frame.
between_long